diff options
Diffstat (limited to 'youtube_dl')
110 files changed, 5048 insertions, 1960 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2187dcc8f..5036289b0 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -64,6 +64,7 @@ from .utils import (      PostProcessingError,      preferredencoding,      prepend_extension, +    register_socks_protocols,      render_table,      replace_extension,      SameFileError, @@ -325,7 +326,7 @@ class YoutubeDL(object):                          ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)                  self._output_channel = os.fdopen(master, 'rb')              except OSError as ose: -                if ose.errno == 2: +                if ose.errno == errno.ENOENT:                      self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')                  else:                      raise @@ -361,6 +362,8 @@ class YoutubeDL(object):          for ph in self.params.get('progress_hooks', []):              self.add_progress_hook(ph) +        register_socks_protocols() +      def warn_if_short_id(self, argv):          # short YouTube ID starting with dash?          
idxs = [ @@ -717,6 +720,7 @@ class YoutubeDL(object):          result_type = ie_result.get('_type', 'video')          if result_type in ('url', 'url_transparent'): +            ie_result['url'] = sanitize_url(ie_result['url'])              extract_flat = self.params.get('extract_flat', False)              if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or                      extract_flat is True): @@ -1219,6 +1223,10 @@ class YoutubeDL(object):          if 'title' not in info_dict:              raise ExtractorError('Missing "title" field in extractor result') +        if not isinstance(info_dict['id'], compat_str): +            self.report_warning('"id" field is not a string - forcing string conversion') +            info_dict['id'] = compat_str(info_dict['id']) +          if 'playlist' not in info_dict:              # It isn't part of a playlist              info_dict['playlist'] = None @@ -2018,6 +2026,7 @@ class YoutubeDL(object):          if opts_cookiefile is None:              self.cookiejar = compat_cookiejar.CookieJar()          else: +            opts_cookiefile = compat_expanduser(opts_cookiefile)              self.cookiejar = compat_cookiejar.MozillaCookieJar(                  opts_cookiefile)              if os.access(opts_cookiefile, os.R_OK): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 737f6545d..4905674ad 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -18,7 +18,6 @@ from .options import (  from .compat import (      compat_expanduser,      compat_getpass, -    compat_print,      compat_shlex_split,      workaround_optparse_bug9161,  ) @@ -67,16 +66,16 @@ def _real_main(argv=None):      # Custom HTTP headers      if opts.headers is not None:          for h in opts.headers: -            if h.find(':', 1) < 0: +            if ':' not in h:                  parser.error('wrong header formatting, it should be key:value, not "%s"' % h) -            key, value = h.split(':', 2) +            key, 
value = h.split(':', 1)              if opts.verbose:                  write_string('[debug] Adding header from command line option %s:%s\n' % (key, value))              std_headers[key] = value      # Dump user agent      if opts.dump_user_agent: -        compat_print(std_headers['User-Agent']) +        write_string(std_headers['User-Agent'] + '\n', out=sys.stdout)          sys.exit(0)      # Batch file verification @@ -86,7 +85,9 @@ def _real_main(argv=None):              if opts.batchfile == '-':                  batchfd = sys.stdin              else: -                batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') +                batchfd = io.open( +                    compat_expanduser(opts.batchfile), +                    'r', encoding='utf-8', errors='ignore')              batch_urls = read_batch_urls(batchfd)              if opts.verbose:                  write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') @@ -99,10 +100,10 @@ def _real_main(argv=None):      if opts.list_extractors:          for ie in list_extractors(opts.age_limit): -            compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '')) +            write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout)              matchedUrls = [url for url in all_urls if ie.suitable(url)]              for mu in matchedUrls: -                compat_print('  ' + mu) +                write_string('  ' + mu + '\n', out=sys.stdout)          sys.exit(0)      if opts.list_extractor_descriptions:          for ie in list_extractors(opts.age_limit): @@ -115,7 +116,7 @@ def _real_main(argv=None):                  _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')                  _COUNTS = ('', '5', '10', 'all')                  desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), 
random.choice(_SEARCHES)) -            compat_print(desc) +            write_string(desc + '\n', out=sys.stdout)          sys.exit(0)      # Conflicting, missing and erroneous options @@ -404,7 +405,7 @@ def _real_main(argv=None):          try:              if opts.load_info_filename is not None: -                retcode = ydl.download_with_info_file(opts.load_info_filename) +                retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename))              else:                  retcode = ydl.download(all_urls)          except MaxDownloadsReached: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 0b6c5ca7a..e3cab4dd0 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -11,6 +11,7 @@ import re  import shlex  import shutil  import socket +import struct  import subprocess  import sys  import itertools @@ -244,13 +245,20 @@ try:  except ImportError:  # Python 2.6      from xml.parsers.expat import ExpatError as compat_xml_parse_error + +etree = xml.etree.ElementTree + + +class _TreeBuilder(etree.TreeBuilder): +    def doctype(self, name, pubid, system): +        pass +  if sys.version_info[0] >= 3: -    compat_etree_fromstring = xml.etree.ElementTree.fromstring +    def compat_etree_fromstring(text): +        return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder()))  else:      # python 2.x tries to encode unicode strings with ascii (see the      # XMLParser._fixtext method) -    etree = xml.etree.ElementTree -      try:          _etree_iter = etree.Element.iter      except AttributeError:  # Python <=2.6 @@ -264,7 +272,7 @@ else:      # 2.7 source      def _XML(text, parser=None):          if not parser: -            parser = etree.XMLParser(target=etree.TreeBuilder()) +            parser = etree.XMLParser(target=_TreeBuilder())          parser.feed(text)          return parser.close() @@ -276,7 +284,7 @@ else:          return el      def compat_etree_fromstring(text): -        doc = _XML(text, 
parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) +        doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory)))          for el in _etree_iter(doc):              if el.text is not None and isinstance(el.text, bytes):                  el.text = el.text.decode('utf-8') @@ -340,9 +348,9 @@ except ImportError:  # Python 2          return parsed_result  try: -    from shlex import quote as shlex_quote +    from shlex import quote as compat_shlex_quote  except ImportError:  # Python < 3.3 -    def shlex_quote(s): +    def compat_shlex_quote(s):          if re.match(r'^[-_\w./]+$', s):              return s          else: @@ -373,6 +381,9 @@ compat_os_name = os._name if os.name == 'java' else os.name  if sys.version_info >= (3, 0):      compat_getenv = os.getenv      compat_expanduser = os.path.expanduser + +    def compat_setenv(key, value, env=os.environ): +        env[key] = value  else:      # Environment variables should be decoded with filesystem encoding.      # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) @@ -384,6 +395,12 @@ else:              env = env.decode(get_filesystem_encoding())          return env +    def compat_setenv(key, value, env=os.environ): +        def encode(v): +            from .utils import get_filesystem_encoding +            return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v +        env[encode(key)] = encode(value) +      # HACK: The default implementations of os.path.expanduser from cpython do not decode      # environment variables with filesystem encoding. We will work around this by      # providing adjusted implementations. 
@@ -456,18 +473,6 @@ else:          print(s) -try: -    subprocess_check_output = subprocess.check_output -except AttributeError: -    def subprocess_check_output(*args, **kwargs): -        assert 'input' not in kwargs -        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs) -        output, _ = p.communicate() -        ret = p.poll() -        if ret: -            raise subprocess.CalledProcessError(ret, p.args, output=output) -        return output -  if sys.version_info < (3, 0) and sys.platform == 'win32':      def compat_getpass(prompt, *args, **kwargs):          if isinstance(prompt, compat_str): @@ -477,6 +482,11 @@ if sys.version_info < (3, 0) and sys.platform == 'win32':  else:      compat_getpass = getpass.getpass +try: +    compat_input = raw_input +except NameError:  # Python 3 +    compat_input = input +  # Python < 2.6.5 require kwargs to be bytes  try:      def _testfunc(x): @@ -583,6 +593,26 @@ if sys.version_info >= (3, 0):  else:      from tokenize import generate_tokens as compat_tokenize_tokenize + +try: +    struct.pack('!I', 0) +except TypeError: +    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument +    # See https://bugs.python.org/issue19099 +    def compat_struct_pack(spec, *args): +        if isinstance(spec, compat_str): +            spec = spec.encode('ascii') +        return struct.pack(spec, *args) + +    def compat_struct_unpack(spec, *args): +        if isinstance(spec, compat_str): +            spec = spec.encode('ascii') +        return struct.unpack(spec, *args) +else: +    compat_struct_pack = struct.pack +    compat_struct_unpack = struct.unpack + +  __all__ = [      'compat_HTMLParser',      'compat_HTTPError', @@ -598,15 +628,20 @@ __all__ = [      'compat_html_entities',      'compat_http_client',      'compat_http_server', +    'compat_input',      'compat_itertools_count',      'compat_kwargs',      'compat_ord',      'compat_os_name',      'compat_parse_qs',      'compat_print', +    
'compat_setenv', +    'compat_shlex_quote',      'compat_shlex_split',      'compat_socket_create_connection',      'compat_str', +    'compat_struct_pack', +    'compat_struct_unpack',      'compat_subprocess_get_DEVNULL',      'compat_tokenize_tokenize',      'compat_urllib_error', @@ -623,7 +658,5 @@ __all__ = [      'compat_urlretrieve',      'compat_xml_parse_error',      'compat_xpath', -    'shlex_quote', -    'subprocess_check_output',      'workaround_optparse_bug9161',  ] diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 8d642fc3e..3ff1f9ed4 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -6,6 +6,7 @@ import sys  import re  from .common import FileDownloader +from ..compat import compat_setenv  from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS  from ..utils import (      cli_option, @@ -198,6 +199,19 @@ class FFmpegFD(ExternalFD):                  '-headers',                  ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] +        env = None +        proxy = self.params.get('proxy') +        if proxy: +            if not re.match(r'^[\da-zA-Z]+://', proxy): +                proxy = 'http://%s' % proxy +            # Since December 2015 ffmpeg supports -http_proxy option (see +            # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) +            # We could switch to the following code if we are able to detect version properly +            # args += ['-http_proxy', proxy] +            env = os.environ.copy() +            compat_setenv('HTTP_PROXY', proxy, env=env) +            compat_setenv('http_proxy', proxy, env=env) +          protocol = info_dict.get('protocol')          if protocol == 'rtmp': @@ -224,7 +238,7 @@ class FFmpegFD(ExternalFD):                  args += ['-rtmp_live', 'live']          args += ['-i', url, '-c', 'copy'] -        if protocol == 'm3u8': +        if 
protocol in ('m3u8', 'm3u8_native'):              if self.params.get('hls_use_mpegts', False) or tmpfilename == '-':                  args += ['-f', 'mpegts']              else: @@ -239,7 +253,7 @@ class FFmpegFD(ExternalFD):          self._debug_cmd(args) -        proc = subprocess.Popen(args, stdin=subprocess.PIPE) +        proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env)          try:              retval = proc.wait()          except KeyboardInterrupt: diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 664d87543..8f88b0241 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -12,37 +12,49 @@ from ..compat import (      compat_urlparse,      compat_urllib_error,      compat_urllib_parse_urlparse, +    compat_struct_pack, +    compat_struct_unpack,  )  from ..utils import (      encodeFilename,      fix_xml_ampersands,      sanitize_open, -    struct_pack, -    struct_unpack,      xpath_text,  ) +class DataTruncatedError(Exception): +    pass + +  class FlvReader(io.BytesIO):      """      Reader for Flv files      The file format is documented in https://www.adobe.com/devnet/f4v.html      """ +    def read_bytes(self, n): +        data = self.read(n) +        if len(data) < n: +            raise DataTruncatedError( +                'FlvReader error: need %d bytes while only %d bytes got' % ( +                    n, len(data))) +        return data +      # Utility functions for reading numbers and strings      def read_unsigned_long_long(self): -        return struct_unpack('!Q', self.read(8))[0] +        return compat_struct_unpack('!Q', self.read_bytes(8))[0]      def read_unsigned_int(self): -        return struct_unpack('!I', self.read(4))[0] +        return compat_struct_unpack('!I', self.read_bytes(4))[0]      def read_unsigned_char(self): -        return struct_unpack('!B', self.read(1))[0] +        return compat_struct_unpack('!B', self.read_bytes(1))[0]      def read_string(self):        
  res = b''          while True: -            char = self.read(1) +            char = self.read_bytes(1)              if char == b'\x00':                  break              res += char @@ -53,18 +65,18 @@ class FlvReader(io.BytesIO):          Read a box and return the info as a tuple: (box_size, box_type, box_data)          """          real_size = size = self.read_unsigned_int() -        box_type = self.read(4) +        box_type = self.read_bytes(4)          header_end = 8          if size == 1:              real_size = self.read_unsigned_long_long()              header_end = 16 -        return real_size, box_type, self.read(real_size - header_end) +        return real_size, box_type, self.read_bytes(real_size - header_end)      def read_asrt(self):          # version          self.read_unsigned_char()          # flags -        self.read(3) +        self.read_bytes(3)          quality_entry_count = self.read_unsigned_char()          # QualityEntryCount          for i in range(quality_entry_count): @@ -85,7 +97,7 @@ class FlvReader(io.BytesIO):          # version          self.read_unsigned_char()          # flags -        self.read(3) +        self.read_bytes(3)          # time scale          self.read_unsigned_int() @@ -119,7 +131,7 @@ class FlvReader(io.BytesIO):          # version          self.read_unsigned_char()          # flags -        self.read(3) +        self.read_bytes(3)          self.read_unsigned_int()  # BootstrapinfoVersion          # Profile,Live,Update,Reserved @@ -194,11 +206,11 @@ def build_fragments_list(boot_info):  def write_unsigned_int(stream, val): -    stream.write(struct_pack('!I', val)) +    stream.write(compat_struct_pack('!I', val))  def write_unsigned_int_24(stream, val): -    stream.write(struct_pack('!I', val)[1:]) +    stream.write(compat_struct_pack('!I', val)[1:])  def write_flv_header(stream): @@ -307,7 +319,7 @@ class F4mFD(FragmentFD):          doc = compat_etree_fromstring(manifest)          formats = 
[(int(f.attrib.get('bitrate', -1)), f)                     for f in self._get_unencrypted_media(doc)] -        if requested_bitrate is None: +        if requested_bitrate is None or len(formats) == 1:              # get the best format              formats = sorted(formats, key=lambda f: f[0])              rate, media = formats[-1] @@ -374,7 +386,17 @@ class F4mFD(FragmentFD):                  down.close()                  reader = FlvReader(down_data)                  while True: -                    _, box_type, box_data = reader.read_box_info() +                    try: +                        _, box_type, box_data = reader.read_box_info() +                    except DataTruncatedError: +                        if test: +                            # In tests, segments may be truncated, and thus +                            # FlvReader may not be able to parse the whole +                            # chunk. If so, write the segment as is +                            # See https://github.com/rg3/youtube-dl/issues/9214 +                            dest_stream.write(down_data) +                            break +                        raise                      if box_type == b'mdat':                          dest_stream.write(box_data)                          break diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a01dac031..54f2108e9 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -4,6 +4,7 @@ import os.path  import re  from .fragment import FragmentFD +from .external import FFmpegFD  from ..compat import compat_urlparse  from ..utils import ( @@ -17,12 +18,45 @@ class HlsFD(FragmentFD):      FD_NAME = 'hlsnative' +    @staticmethod +    def can_download(manifest): +        UNSUPPORTED_FEATURES = ( +            r'#EXT-X-KEY:METHOD=(?!NONE)',  # encrypted streams [1] +            r'#EXT-X-BYTERANGE',  # playlists composed of byte ranges of media files [2] + +            # Live streams heuristic does 
not always work (e.g. geo restricted to Germany +            # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) +            # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)',  # live streams [3] + +            # This heuristic also is not correct since segments may not be appended as well. +            # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite +            # no segments will definitely be appended to the end of the playlist. +            # r'#EXT-X-PLAYLIST-TYPE:EVENT',  # media segments may be appended to the end of +            #                                 # event media playlists [4] + +            # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 +            # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 +            # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 +            # 4. 
https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 +        ) +        return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) +      def real_download(self, filename, info_dict):          man_url = info_dict['url']          self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)          manifest = self.ydl.urlopen(man_url).read()          s = manifest.decode('utf-8', 'ignore') + +        if not self.can_download(s): +            self.report_warning( +                'hlsnative has detected features it does not support, ' +                'extraction will be delegated to ffmpeg') +            fd = FFmpegFD(self.ydl, self.params) +            for ph in self._progress_hooks: +                fd.add_progress_hook(ph) +            return fd.real_download(filename, info_dict) +          fragment_urls = []          for line in s.splitlines():              line = line.strip() diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py new file mode 100644 index 000000000..b61a6327c --- /dev/null +++ b/youtube_dl/extractor/abcnews.py @@ -0,0 +1,135 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import calendar +import re +import time + +from .amp import AMPIE +from .common import InfoExtractor +from ..compat import compat_urlparse + + +class AbcNewsVideoIE(AMPIE): +    IE_NAME = 'abcnews:video' +    _VALID_URL = 'http://abcnews.go.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)' + +    _TESTS = [{ +        'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', +        'info_dict': { +            'id': '20411932', +            'ext': 'mp4', +            'display_id': 'week-exclusive-irans-foreign-minister-zarif', +            'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif', +            'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. 
Javad Zarif.', +            'duration': 180, +            'thumbnail': 're:^https?://.*\.jpg$', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('display_id') +        video_id = mobj.group('id') +        info_dict = self._extract_feed_info( +            'http://abcnews.go.com/video/itemfeed?id=%s' % video_id) +        info_dict.update({ +            'id': video_id, +            'display_id': display_id, +        }) +        return info_dict + + +class AbcNewsIE(InfoExtractor): +    IE_NAME = 'abcnews' +    _VALID_URL = 'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' + +    _TESTS = [{ +        'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', +        'info_dict': { +            'id': '10498713', +            'ext': 'flv', +            'display_id': 'dramatic-video-rare-death-job-america', +            'title': 'Occupational Hazards', +            'description': 'Nightline investigates the dangers that lurk at various jobs.', +            'thumbnail': 're:^https?://.*\.jpg$', +            'upload_date': '20100428', +            'timestamp': 1272412800, +        }, +        'add_ie': ['AbcNewsVideo'], +    }, { +        'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', +        'info_dict': { +            'id': '39125818', +            'ext': 'mp4', +            'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', +            'title': 'Justin Timberlake Drops Hints For Secret Single', +            'description': 'Lara Spencer reports the 
buzziest stories of the day in "GMA" Pop News.', +            'upload_date': '20160515', +            'timestamp': 1463329500, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +            # The embedded YouTube video is blocked due to copyright issues +            'playlist_items': '1', +        }, +        'add_ie': ['AbcNewsVideo'], +    }, { +        'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id = mobj.group('display_id') +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        video_url = self._search_regex( +            r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') +        full_video_url = compat_urlparse.urljoin(url, video_url) + +        youtube_url = self._html_search_regex( +            r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"', +            webpage, 'YouTube URL', default=None) + +        timestamp = None +        date_str = self._html_search_regex( +            r'<span[^>]+class="timestamp">([^<]+)</span>', +            webpage, 'timestamp', fatal=False) +        if date_str: +            tz_offset = 0 +            if date_str.endswith(' ET'):  # Eastern Time +                tz_offset = -5 +                date_str = date_str[:-3] +            date_formats = ['%b. 
%d, %Y', '%b %d, %Y, %I:%M %p'] +            for date_format in date_formats: +                try: +                    timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) +                except ValueError: +                    continue +            if timestamp is not None: +                timestamp -= tz_offset * 3600 + +        entry = { +            '_type': 'url_transparent', +            'ie_key': AbcNewsVideoIE.ie_key(), +            'url': full_video_url, +            'id': video_id, +            'display_id': display_id, +            'timestamp': timestamp, +        } + +        if youtube_url: +            entries = [entry, self.url_result(youtube_url, 'Youtube')] +            return self.playlist_result(entries) + +        return entry diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 138fa0808..8545681be 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -52,7 +52,7 @@ class AMPIE(InfoExtractor):          for media_data in media_content:              media = media_data['@attributes']              media_type = media['type'] -            if media_type == 'video/f4m': +            if media_type in ('video/f4m', 'application/f4m+xml'):                  formats.extend(self._extract_f4m_formats(                      media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',                      video_id, f4m_id='hds', fatal=False)) @@ -61,7 +61,7 @@ class AMPIE(InfoExtractor):                      media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False))              else:                  formats.append({ -                    'format_id': media_data['media-category']['@attributes']['label'], +                    'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),                      'url': media['url'],                      'tbr': int_or_none(media.get('bitrate')),                      'filesize': int_or_none(media.get('fileSize')), diff --git 
a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py new file mode 100644 index 000000000..cb29cf111 --- /dev/null +++ b/youtube_dl/extractor/anvato.py @@ -0,0 +1,224 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import hashlib +import json +import random +import time + +from .common import InfoExtractor +from ..aes import aes_encrypt +from ..compat import compat_str +from ..utils import ( +    bytes_to_intlist, +    determine_ext, +    intlist_to_bytes, +    int_or_none, +    strip_jsonp, +) + + +def md5_text(s): +    if not isinstance(s, compat_str): +        s = compat_str(s) +    return hashlib.md5(s.encode('utf-8')).hexdigest() + + +class AnvatoIE(InfoExtractor): +    # Copied from anvplayer.min.js +    _ANVACK_TABLE = { +        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', +        'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA', +        'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP', +        'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv', +        'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7', +        'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR', +        'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg', +        'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto', +        'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY', +        'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 
'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh', +        'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK', +        'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D', +        'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad', +        'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp', +        'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih', +        'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR', +        'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW', +        'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su', +        'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q', +        'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5', +        'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3', +        'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI', +        'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s', +        'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz', +        'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg', +        'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 
'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x', +        'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH', +        'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX', +        'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc', +        'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK', +        'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7', +        'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C', +        'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e', +        'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1', +        'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re', +        'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51', +        'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho', +        'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': '9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9', +        'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH', +        'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F', +        'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo', +        
'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR', +        'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa', +        'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk', +        'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ', +        'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ', +        'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m', +        'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b', +        'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3', +        'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK', +        'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua', +        'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua', +        'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F', +        'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx', +        'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ', +        'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH', +        'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 
'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm', +        'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt', +        'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl', +        'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b', +        'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV', +        'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg', +        'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk', +        'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT', +        'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa', +        'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv', +        'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k', +        'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI', +        'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr', +        'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw', +        'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K', +        'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH', +        'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK', 
+        'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu', +        'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg', +        'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg', +        'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK', +        'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n', +        'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD', +        'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk', +        'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn', +        'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W', +        'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ', +        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ' +    } + +    _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' + +    def __init__(self, *args, **kwargs): +        super(AnvatoIE, self).__init__(*args, **kwargs) +        self.__server_time = None + +    def _server_time(self, access_key, video_id): +        if self.__server_time is not None: +            return self.__server_time + +        self.__server_time = int(self._download_json( +            self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id, +            note='Fetching server time')['server_time']) + +        return self.__server_time + +    def _api_prefix(self, access_key): +        return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 
'stage') + +    def _get_video_json(self, access_key, video_id): +        # See et() in anvplayer.min.js, which is an alias of getVideoJSON() +        video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) +        server_time = self._server_time(access_key, video_id) +        input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) + +        auth_secret = intlist_to_bytes(aes_encrypt( +            bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) + +        video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') +        anvrid = md5_text(time.time() * 1000 * random.random())[:30] +        payload = { +            'api': { +                'anvrid': anvrid, +                'anvstk': md5_text('%s|%s|%d|%s' % ( +                    access_key, anvrid, server_time, self._ANVACK_TABLE[access_key])), +                'anvts': server_time, +            }, +        } + +        return self._download_json( +            video_data_url, video_id, transform_source=strip_jsonp, +            data=json.dumps(payload).encode('utf-8')) + +    def _extract_anvato_videos(self, webpage, video_id): +        anvplayer_data = self._parse_json(self._html_search_regex( +            r'<script[^>]+data-anvp=\'([^\']+)\'', webpage, +            'Anvato player data'), video_id) + +        video_id = anvplayer_data['video'] +        access_key = anvplayer_data['accessKey'] + +        video_data = self._get_video_json(access_key, video_id) + +        formats = [] +        for published_url in video_data['published_urls']: +            video_url = published_url['embed_url'] +            ext = determine_ext(video_url) + +            if ext == 'smil': +                formats.extend(self._extract_smil_formats(video_url, video_id)) +                continue + +            tbr = int_or_none(published_url.get('kbps')) +            a_format = { +                'url': 
video_url, +                'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(), +                'tbr': tbr if tbr != 0 else None, +            } + +            if ext == 'm3u8': +                # Not using _extract_m3u8_formats here as individual media +                # playlists are also included in published_urls. +                if tbr is None: +                    formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls')) +                    continue +                else: +                    a_format.update({ +                        'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), +                        'ext': 'mp4', +                    }) +            elif ext == 'mp3': +                a_format['vcodec'] = 'none' +            else: +                a_format.update({ +                    'width': int_or_none(published_url.get('width')), +                    'height': int_or_none(published_url.get('height')), +                }) +            formats.append(a_format) + +        self._sort_formats(formats) + +        subtitles = {} +        for caption in video_data.get('captions', []): +            a_caption = { +                'url': caption['url'], +                'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None +            } +            subtitles.setdefault(caption['language'], []).append(a_caption) + +        return { +            'id': video_id, +            'formats': formats, +            'title': video_data.get('def_title'), +            'description': video_data.get('def_description'), +            'categories': video_data.get('categories'), +            'thumbnail': video_data.get('thumbnail'), +            'subtitles': subtitles, +        } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 881cacfab..f40532929 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -61,10 +61,7 @@ class 
ArteTvIE(InfoExtractor):          } -class ArteTVPlus7IE(InfoExtractor): -    IE_NAME = 'arte.tv:+7' -    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' - +class ArteTVBaseIE(InfoExtractor):      @classmethod      def _extract_url_info(cls, url):          mobj = re.match(cls._VALID_URL, url) @@ -78,60 +75,6 @@ class ArteTVPlus7IE(InfoExtractor):              video_id = mobj.group('id')          return video_id, lang -    def _real_extract(self, url): -        video_id, lang = self._extract_url_info(url) -        webpage = self._download_webpage(url, video_id) -        return self._extract_from_webpage(webpage, video_id, lang) - -    def _extract_from_webpage(self, webpage, video_id, lang): -        patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') -        ids = (video_id, '') -        # some pages contain multiple videos (like -        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), -        # so we first try to look for json URLs that contain the video id from -        # the 'vid' parameter. 
-        patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] -        json_url = self._html_search_regex( -            patterns, webpage, 'json vp url', default=None) -        if not json_url: -            def find_iframe_url(webpage, default=NO_DEFAULT): -                return self._html_search_regex( -                    r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', -                    webpage, 'iframe url', group='url', default=default) - -            iframe_url = find_iframe_url(webpage, None) -            if not iframe_url: -                embed_url = self._html_search_regex( -                    r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) -                if embed_url: -                    player = self._download_json( -                        embed_url, video_id, 'Downloading player page') -                    iframe_url = find_iframe_url(player['html']) -            # en and es URLs produce react-based pages with different layout (e.g. 
-            # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) -            if not iframe_url: -                program = self._search_regex( -                    r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', -                    webpage, 'program', default=None) -                if program: -                    embed_html = self._parse_json(program, video_id) -                    if embed_html: -                        iframe_url = find_iframe_url(embed_html['embed_html']) -            if iframe_url: -                json_url = compat_parse_qs( -                    compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] -        if json_url: -            title = self._search_regex( -                r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', -                webpage, 'title', default=None, group='title') -            return self._extract_from_json_url(json_url, video_id, lang, title=title) -        # Different kind of embed URL (e.g. -        # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) -        embed_url = self._search_regex( -            r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', -            webpage, 'embed url', group='url') -        return self.url_result(embed_url) -      def _extract_from_json_url(self, json_url, video_id, lang, title=None):          info = self._download_json(json_url, video_id)          player_info = info['videoJsonPlayer'] @@ -161,24 +104,53 @@ class ArteTVPlus7IE(InfoExtractor):              'es': 'E[ESP]',          } +        langcode = LANGS.get(lang, lang) +          formats = []          for format_id, format_dict in player_info['VSR'].items():              f = dict(format_dict)              versionCode = f.get('versionCode') -            langcode = LANGS.get(lang, lang) -            lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] -            lang_pref = None -            if versionCode: -                matched_lang_rexs = [r for r in 
lang_rexs if re.match(r, versionCode)] -                lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) -            source_pref = 0 -            if versionCode is not None: -                # The original version with subtitles has lower relevance -                if re.match(r'VO-ST(F|A|E)', versionCode): -                    source_pref -= 10 -                # The version with sourds/mal subtitles has also lower relevance -                elif re.match(r'VO?(F|A|E)-STM\1', versionCode): -                    source_pref -= 9 +            l = re.escape(langcode) + +            # Language preference from most to least priority +            # Reference: section 5.6.3 of +            # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf +            PREFERENCES = ( +                # original version in requested language, without subtitles +                r'VO{0}$'.format(l), +                # original version in requested language, with partial subtitles in requested language +                r'VO{0}-ST{0}$'.format(l), +                # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language +                r'VO{0}-STM{0}$'.format(l), +                # non-original (dubbed) version in requested language, without subtitles +                r'V{0}$'.format(l), +                # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language +                r'V{0}-ST{0}$'.format(l), +                # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language +                r'V{0}-STM{0}$'.format(l), +                # original version in requested language, with partial subtitles in different language +                r'VO{0}-ST(?!{0}).+?$'.format(l), +                # original version in requested language, with subtitles for the 
deaf and hard-of-hearing in different language +                r'VO{0}-STM(?!{0}).+?$'.format(l), +                # original version in different language, with partial subtitles in requested language +                r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), +                # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language +                r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), +                # original version in different language, without subtitles +                r'VO(?:(?!{0}))?$'.format(l), +                # original version in different language, with partial subtitles in different language +                r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), +                # original version in different language, with subtitles for the deaf and hard-of-hearing in different language +                r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), +            ) + +            for pref, p in enumerate(PREFERENCES): +                if re.match(p, versionCode): +                    lang_pref = len(PREFERENCES) - pref +                    break +            else: +                lang_pref = -1 +              format = {                  'format_id': format_id,                  'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -188,7 +160,6 @@ class ArteTVPlus7IE(InfoExtractor):                  'height': int_or_none(f.get('height')),                  'tbr': int_or_none(f.get('bitrate')),                  'quality': qfunc(f.get('quality')), -                'source_preference': source_pref,              }              if f.get('mediaType') == 'rtmp': @@ -207,6 +178,74 @@ class ArteTVPlus7IE(InfoExtractor):          return info_dict +class ArteTVPlus7IE(ArteTVBaseIE): +    IE_NAME = 'arte.tv:+7' +    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' + +    _TESTS = [{ +        'url': 
'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', +        'only_matching': True, +    }] + +    @classmethod +    def suitable(cls, url): +        return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url) + +    def _real_extract(self, url): +        video_id, lang = self._extract_url_info(url) +        webpage = self._download_webpage(url, video_id) +        return self._extract_from_webpage(webpage, video_id, lang) + +    def _extract_from_webpage(self, webpage, video_id, lang): +        patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') +        ids = (video_id, '') +        # some pages contain multiple videos (like +        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), +        # so we first try to look for json URLs that contain the video id from +        # the 'vid' parameter. +        patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] +        json_url = self._html_search_regex( +            patterns, webpage, 'json vp url', default=None) +        if not json_url: +            def find_iframe_url(webpage, default=NO_DEFAULT): +                return self._html_search_regex( +                    r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', +                    webpage, 'iframe url', group='url', default=default) + +            iframe_url = find_iframe_url(webpage, None) +            if not iframe_url: +                embed_url = self._html_search_regex( +                    r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) +                if embed_url: +                    player = self._download_json( +                        embed_url, video_id, 'Downloading player page') +                    iframe_url = find_iframe_url(player['html']) +            # en and es URLs produce react-based pages with different layout (e.g. 
+            # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) +            if not iframe_url: +                program = self._search_regex( +                    r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', +                    webpage, 'program', default=None) +                if program: +                    embed_html = self._parse_json(program, video_id) +                    if embed_html: +                        iframe_url = find_iframe_url(embed_html['embed_html']) +            if iframe_url: +                json_url = compat_parse_qs( +                    compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] +        if json_url: +            title = self._search_regex( +                r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', +                webpage, 'title', default=None, group='title') +            return self._extract_from_json_url(json_url, video_id, lang, title=title) +        # Different kind of embed URL (e.g. +        # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) +        embed_url = self._search_regex( +            r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', +            webpage, 'embed url', group='url') +        return self.url_result(embed_url) + +  # It also uses the arte_vp_url url from the webpage to extract the information  class ArteTVCreativeIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:creative' @@ -239,7 +278,7 @@ class ArteTVInfoIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:info'      _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' -    _TEST = { +    _TESTS = [{          'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere',          'info_dict': {              'id': '067528-000-A', @@ -247,7 +286,7 @@ class ArteTVInfoIE(ArteTVPlus7IE):              'title': 'Service civique, un cache misère ?',              'upload_date': '20160403',          }, -    } +    }]  class ArteTVFutureIE(ArteTVPlus7IE): @@ -272,6 +311,8 @@ 
class ArteTVDDCIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:ddc'      _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' +    _TESTS = [] +      def _real_extract(self, url):          video_id, lang = self._extract_url_info(url)          if lang == 'folge': @@ -290,7 +331,7 @@ class ArteTVConcertIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:concert'      _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' -    _TEST = { +    _TESTS = [{          'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',          'md5': '9ea035b7bd69696b67aa2ccaaa218161',          'info_dict': { @@ -300,14 +341,14 @@ class ArteTVConcertIE(ArteTVPlus7IE):              'upload_date': '20140128',              'description': 'md5:486eb08f991552ade77439fe6d82c305',          }, -    } +    }]  class ArteTVCinemaIE(ArteTVPlus7IE):      IE_NAME = 'arte.tv:cinema'      _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' -    _TEST = { +    _TESTS = [{          'url': 'http://cinema.arte.tv/de/node/38291',          'md5': '6b275511a5107c60bacbeeda368c3aa1',          'info_dict': { @@ -317,7 +358,7 @@ class ArteTVCinemaIE(ArteTVPlus7IE):              'upload_date': '20160122',              'description': 'md5:7f749bbb77d800ef2be11d54529b96bc',          }, -    } +    }]  class ArteTVMagazineIE(ArteTVPlus7IE): @@ -362,9 +403,41 @@ class ArteTVEmbedIE(ArteTVPlus7IE):          )      ''' +    _TESTS = [] +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id')          lang = mobj.group('lang')          json_url = mobj.group('json_url')          return self._extract_from_json_url(json_url, video_id, lang) + + +class ArteTVPlaylistIE(ArteTVBaseIE): +    IE_NAME = 'arte.tv:playlist' +    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)' + +    _TESTS = [{ +        'url': 
'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV', +        'info_dict': { +            'id': 'PL-013263', +            'title': 'Areva & Uramin', +        }, +        'playlist_mincount': 6, +    }, { +        'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        playlist_id, lang = self._extract_url_info(url) +        collection = self._download_json( +            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' +            % (lang, playlist_id), playlist_id) +        title = collection.get('title') +        description = collection.get('shortDescription') or collection.get('teaserText') +        entries = [ +            self._extract_from_json_url( +                video['jsonUrl'], video.get('programId') or playlist_id, lang) +            for video in collection['videos'] if video.get('jsonUrl')] +        return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index c1ef8051d..991ab0676 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -29,7 +29,7 @@ class BandcampIE(InfoExtractor):          '_skip': 'There is a limit of 200 free downloads / month for the test song'      }, {          'url': 'http://benprunty.bandcamp.com/track/lanius-battle', -        'md5': '2b68e5851514c20efdff2afc5603b8b4', +        'md5': '73d0b3171568232574e45652f8720b5c',          'info_dict': {              'id': '2650410135',              'ext': 'mp3', @@ -48,6 +48,10 @@ class BandcampIE(InfoExtractor):              if m_trackinfo:                  json_code = m_trackinfo.group(1)                  data = json.loads(json_code)[0] +                track_id = compat_str(data['id']) + +                if not data.get('file'): +                    raise ExtractorError('Not streamable', 
video_id=track_id, expected=True)                  formats = []                  for format_id, format_url in data['file'].items(): @@ -64,7 +68,7 @@ class BandcampIE(InfoExtractor):                  self._sort_formats(formats)                  return { -                    'id': compat_str(data['id']), +                    'id': track_id,                      'title': data['title'],                      'formats': formats,                      'duration': float_or_none(data.get('duration')), diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 8baff2041..b17047b39 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -1,34 +1,42 @@  # coding: utf-8  from __future__ import unicode_literals +import calendar +import datetime  import re  from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( +    compat_etree_fromstring, +    compat_str, +    compat_parse_qs, +    compat_xml_parse_error, +)  from ..utils import ( -    int_or_none, -    unescapeHTML,      ExtractorError, +    int_or_none, +    float_or_none,      xpath_text,  )  class BiliBiliIE(InfoExtractor): -    _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?' 
+    _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)'      _TESTS = [{          'url': 'http://www.bilibili.tv/video/av1074402/', -        'md5': '2c301e4dab317596e837c3e7633e7d86', +        'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',          'info_dict': {              'id': '1554319',              'ext': 'flv',              'title': '【金坷垃】金泡沫', -            'duration': 308313, +            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', +            'duration': 308.067, +            'timestamp': 1398012660,              'upload_date': '20140420',              'thumbnail': 're:^https?://.+\.jpg', -            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', -            'timestamp': 1397983878,              'uploader': '菊子桑', +            'uploader_id': '156160',          },      }, {          'url': 'http://www.bilibili.com/video/av1041170/', @@ -36,75 +44,186 @@ class BiliBiliIE(InfoExtractor):              'id': '1041170',              'title': '【BD1080P】刀语【诸神&异域】',              'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', -            'uploader': '枫叶逝去', -            'timestamp': 1396501299,          },          'playlist_count': 9, +    }, { +        'url': 'http://www.bilibili.com/video/av4808130/', +        'info_dict': { +            'id': '4808130', +            'title': '【长篇】哆啦A梦443【钉铛】', +            'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', +        }, +        'playlist': [{ +            'md5': '55cdadedf3254caaa0d5d27cf20a8f9c', +            'info_dict': { +                'id': '4808130_part1', +                'ext': 'flv', +                'title': '【长篇】哆啦A梦443【钉铛】', +                'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', +                'timestamp': 1464564180, +                'upload_date': '20160529', +                
'uploader': '喜欢拉面', +                'uploader_id': '151066', +            }, +        }, { +            'md5': '926f9f67d0c482091872fbd8eca7ea3d', +            'info_dict': { +                'id': '4808130_part2', +                'ext': 'flv', +                'title': '【长篇】哆啦A梦443【钉铛】', +                'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', +                'timestamp': 1464564180, +                'upload_date': '20160529', +                'uploader': '喜欢拉面', +                'uploader_id': '151066', +            }, +        }, { +            'md5': '4b7b225b968402d7c32348c646f1fd83', +            'info_dict': { +                'id': '4808130_part3', +                'ext': 'flv', +                'title': '【长篇】哆啦A梦443【钉铛】', +                'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', +                'timestamp': 1464564180, +                'upload_date': '20160529', +                'uploader': '喜欢拉面', +                'uploader_id': '151066', +            }, +        }, { +            'md5': '7b795e214166501e9141139eea236e91', +            'info_dict': { +                'id': '4808130_part4', +                'ext': 'flv', +                'title': '【长篇】哆啦A梦443【钉铛】', +                'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', +                'timestamp': 1464564180, +                'upload_date': '20160529', +                'uploader': '喜欢拉面', +                'uploader_id': '151066', +            }, +        }], +    }, { +        # Missing upload time +        'url': 'http://www.bilibili.com/video/av1867637/', +        'info_dict': { +            'id': '2880301', +            'ext': 'flv', +            'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】', +            'description': 
'一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】', +            'uploader': '黑夜为猫', +            'uploader_id': '610729', +        }, +        'params': { +            # Just to test metadata extraction +            'skip_download': True, +        }, +        'expected_warnings': ['upload time'],      }] +    # BiliBili blocks keys from time to time. The current key is extracted from +    # the Android client +    # TODO: find the sign algorithm used in the flash player +    _APP_KEY = '86385cdc024c0f6c' +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') -        page_num = mobj.group('page_num') or '1' -        view_data = self._download_json( -            'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num), -            video_id) -        if 'error' in view_data: -            raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True) +        webpage = self._download_webpage(url, video_id) -        cid = view_data['cid'] -        title = unescapeHTML(view_data['title']) +        params = compat_parse_qs(self._search_regex( +            [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', +             r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], +            webpage, 'player parameters')) +        cid = params['cid'][0] -        doc = self._download_xml( -            'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid, -            cid, -            'Downloading page %s/%s' % (page_num, view_data['pages']) -        ) +        info_xml_str = self._download_webpage( +            'http://interface.bilibili.com/v_cdn_play', +            cid, query={'appkey': self._APP_KEY, 'cid': cid}, +            note='Downloading video info page') + +        err_msg = None +        durls = None +        info_xml = None +        try: +     
       info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8')) +        except compat_xml_parse_error: +            info_json = self._parse_json(info_xml_str, video_id, fatal=False) +            err_msg = (info_json or {}).get('error_text') +        else: +            err_msg = xpath_text(info_xml, './message') -        if xpath_text(doc, './result') == 'error': -            raise ExtractorError('%s said: %s' % (self.IE_NAME, xpath_text(doc, './message')), expected=True) +        if info_xml is not None: +            durls = info_xml.findall('./durl') +        if not durls: +            if err_msg: +                raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True) +            else: +                raise ExtractorError('No videos found!')          entries = [] -        for durl in doc.findall('./durl'): +        for durl in durls:              size = xpath_text(durl, ['./filesize', './size'])              formats = [{                  'url': durl.find('./url').text,                  'filesize': int_or_none(size), -                'ext': 'flv',              }] -            backup_urls = durl.find('./backup_url') -            if backup_urls is not None: -                for backup_url in backup_urls.findall('./url'): -                    formats.append({'url': backup_url.text}) -            formats.reverse() +            for backup_url in durl.findall('./backup_url/url'): +                formats.append({ +                    'url': backup_url.text, +                    # backup URLs have lower priorities +                    'preference': -2 if 'hd.mp4' in backup_url.text else -3, +                }) + +            self._sort_formats(formats)              entries.append({                  'id': '%s_part%s' % (cid, xpath_text(durl, './order')), -                'title': title,                  'duration': int_or_none(xpath_text(durl, './length'), 1000),                  'formats': formats,              }) +        title = 
self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') +        description = self._html_search_meta('description', webpage) +        datetime_str = self._html_search_regex( +            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False) +        timestamp = None +        if datetime_str: +            timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple()) + +        # TODO 'view_count' requires deobfuscating Javascript          info = {              'id': compat_str(cid),              'title': title, -            'description': view_data.get('description'), -            'thumbnail': view_data.get('pic'), -            'uploader': view_data.get('author'), -            'timestamp': int_or_none(view_data.get('created')), -            'view_count': int_or_none(view_data.get('play')), -            'duration': int_or_none(xpath_text(doc, './timelength')), +            'description': description, +            'timestamp': timestamp, +            'thumbnail': self._html_search_meta('thumbnailUrl', webpage), +            'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000),          } +        uploader_mobj = re.search( +            r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"', +            webpage) +        if uploader_mobj: +            info.update({ +                'uploader': uploader_mobj.group('name'), +                'uploader_id': uploader_mobj.group('id'), +            }) + +        for entry in entries: +            entry.update(info) +          if len(entries) == 1: -            entries[0].update(info)              return entries[0]          else: -            info.update({ +            for idx, entry in enumerate(entries): +                entry['id'] = '%s_part%d' % (video_id, (idx + 1)) + +            return {                  '_type': 'multi_video',                  'id': video_id, +                'title': title, +  
              'description': description,                  'entries': entries, -            }) -            return info +            } diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py new file mode 100644 index 000000000..ae4579b33 --- /dev/null +++ b/youtube_dl/extractor/biqle.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BIQLEIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)' +    _TESTS = [{ +        'url': 'http://www.biqle.ru/watch/847655_160197695', +        'md5': 'ad5f746a874ccded7b8f211aeea96637', +        'info_dict': { +            'id': '160197695', +            'ext': 'mp4', +            'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)', +            'uploader': 'Andrey Rogozin', +            'upload_date': '20110605', +        } +    }, { +        'url': 'https://biqle.org/watch/-44781847_168547604', +        'md5': '7f24e72af1db0edf7c1aaba513174f97', +        'info_dict': { +            'id': '168547604', +            'ext': 'mp4', +            'title': 'Ребенок в шоке от автоматической мойки', +            'uploader': 'Dmitry Kotov', +        } +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        embed_url = self._proto_relative_url(self._search_regex( +            r'<iframe.+?src="((?:http:)?//daxab\.com/[^"]+)".*?></iframe>', webpage, 'embed url')) + +        return { +            '_type': 'url_transparent', +            'url': embed_url, +        } diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 13343bc25..bd538be50 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -17,6 +17,9 @@ class BloombergIE(InfoExtractor):              'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',           
   'description': 'md5:a8ba0302912d03d246979735c17d2761',          }, +        'params': { +            'format': 'best[format_id^=hds]', +        },      }, {          'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',          'only_matching': True, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f0781fc27..ef560b592 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -307,9 +307,10 @@ class BrightcoveLegacyIE(InfoExtractor):                                      playlist_title=playlist_info['mediaCollectionDTO']['displayName'])      def _extract_video_info(self, video_info): +        video_id = compat_str(video_info['id'])          publisher_id = video_info.get('publisherId')          info = { -            'id': compat_str(video_info['id']), +            'id': video_id,              'title': video_info['displayName'].strip(),              'description': video_info.get('shortDescription'),              'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), @@ -331,7 +332,8 @@ class BrightcoveLegacyIE(InfoExtractor):                      url_comp = compat_urllib_parse_urlparse(url)                      if url_comp.path.endswith('.m3u8'):                          formats.extend( -                            self._extract_m3u8_formats(url, info['id'], 'mp4')) +                            self._extract_m3u8_formats( +                                url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))                          continue                      elif 'akamaihd.net' in url_comp.netloc:                          # This type of renditions are served through @@ -365,7 +367,7 @@ class BrightcoveLegacyIE(InfoExtractor):                      a_format.update({                          'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''),                          'ext': 'mp4', 
-                        'protocol': 'm3u8', +                        'protocol': 'm3u8_native',                      })                  formats.append(a_format) @@ -395,7 +397,7 @@ class BrightcoveLegacyIE(InfoExtractor):                      return ad_info          if 'url' not in info and not info.get('formats'): -            raise ExtractorError('Unable to extract video url for %s' % info['id']) +            raise ExtractorError('Unable to extract video url for %s' % video_id)          return info @@ -442,6 +444,10 @@ class BrightcoveNewIE(InfoExtractor):          # non numeric ref: prefixed video id          'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356',          'only_matching': True, +    }, { +        # unavailable video without message but with error_code +        'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001', +        'only_matching': True,      }]      @staticmethod @@ -512,8 +518,9 @@ class BrightcoveNewIE(InfoExtractor):              })          except ExtractorError as e:              if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: -                json_data = self._parse_json(e.cause.read().decode(), video_id) -                raise ExtractorError(json_data[0]['message'], expected=True) +                json_data = self._parse_json(e.cause.read().decode(), video_id)[0] +                raise ExtractorError( +                    json_data.get('message') or json_data['error_code'], expected=True)              raise          title = json_data['name'].strip() @@ -527,7 +534,7 @@ class BrightcoveNewIE(InfoExtractor):                  if not src:                      continue                  formats.extend(self._extract_m3u8_formats( -                    src, video_id, 'mp4', m3u8_id='hls', fatal=False)) +                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))       
       elif source_type == 'application/dash+xml':                  if not src:                      continue diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index dda98059e..3aec601f8 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -11,6 +11,7 @@ class BYUtvIE(InfoExtractor):      _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'      _TEST = {          'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', +        'md5': '05850eb8c749e2ee05ad5a1c34668493',          'info_dict': {              'id': 'studio-c-season-5-episode-5',              'ext': 'mp4', @@ -21,7 +22,8 @@ class BYUtvIE(InfoExtractor):          },          'params': {              'skip_download': True, -        } +        }, +        'add_ie': ['Ooyala'],      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 25b2d4efe..61463f249 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -4,11 +4,11 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse  from ..utils import (      ExtractorError,      HEADRequest,      unified_strdate, -    url_basename,      qualities,      int_or_none,  ) @@ -16,24 +16,38 @@ from ..utils import (  class CanalplusIE(InfoExtractor):      IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv' -    _VALID_URL = r'https?://(?:www\.(?P<site>canalplus\.fr|piwiplus\.fr|d8\.tv|itele\.fr)/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))' +    _VALID_URL = r'''(?x) +                        https?:// +                            (?: +                                (?: +                                    (?:(?:www|m)\.)?canalplus\.fr| +                                    (?:www\.)?piwiplus\.fr| +                                    
(?:www\.)?d8\.tv| +                                    (?:www\.)?d17\.tv| +                                    (?:www\.)?itele\.fr +                                )/(?:(?:[^/]+/)*(?P<display_id>[^/?#&]+))?(?:\?.*\bvid=(?P<vid>\d+))?| +                                player\.canalplus\.fr/#/(?P<id>\d+) +                            ) + +                    '''      _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json'      _SITE_ID_MAP = { -        'canalplus.fr': 'cplus', -        'piwiplus.fr': 'teletoon', -        'd8.tv': 'd8', -        'itele.fr': 'itele', +        'canalplus': 'cplus', +        'piwiplus': 'teletoon', +        'd8': 'd8', +        'd17': 'd17', +        'itele': 'itele',      }      _TESTS = [{ -        'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092', -        'md5': '12164a6f14ff6df8bd628e8ba9b10b78', +        'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814', +        'md5': '41f438a4904f7664b91b4ed0dec969dc',          'info_dict': { -            'id': '1263092', +            'id': '1192814',              'ext': 'mp4', -            'title': 'Le Zapping - 13/05/15', -            'description': 'md5:09738c0d06be4b5d06a0940edb0da73f', -            'upload_date': '20150513', +            'title': "L'Année du Zapping 2014 - L'Année du Zapping 2014", +            'description': "Toute l'année 2014 dans un Zapping exceptionnel !", +            'upload_date': '20150105',          },      }, {          'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', @@ -46,35 +60,45 @@ class CanalplusIE(InfoExtractor):          },          'skip': 'Only works from France',      }, { -        'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html', +        'url': 'http://www.d8.tv/d8-docs-mags/pid5198-d8-en-quete-d-actualite.html?vid=1390231',          'info_dict': { -            'id': 
'966289', -            'ext': 'flv', -            'title': 'Campagne intime - Documentaire exceptionnel', -            'description': 'md5:d2643b799fb190846ae09c61e59a859f', -            'upload_date': '20131108', +            'id': '1390231', +            'ext': 'mp4', +            'title': "Vacances pas chères : prix discount ou grosses dépenses ? - En quête d'actualité", +            'description': 'md5:edb6cf1cb4a1e807b5dd089e1ac8bfc6', +            'upload_date': '20160512', +        }, +        'params': { +            'skip_download': True,          }, -        'skip': 'videos get deleted after a while',      }, { -        'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559', -        'md5': '38b8f7934def74f0d6f3ba6c036a5f82', +        'url': 'http://www.itele.fr/chroniques/invite-bruce-toussaint/thierry-solere-nicolas-sarkozy-officialisera-sa-candidature-a-la-primaire-quand-il-le-voudra-167224',          'info_dict': { -            'id': '1213714', +            'id': '1398334',              'ext': 'mp4', -            'title': 'Aubervilliers : un lycée en colère - Le 11/02/2015 à 06h45', -            'description': 'md5:8216206ec53426ea6321321f3b3c16db', -            'upload_date': '20150211', +            'title': "L'invité de Bruce Toussaint du 07/06/2016 - ", +            'description': 'md5:40ac7c9ad0feaeb6f605bad986f61324', +            'upload_date': '20160607',          }, +        'params': { +            'skip_download': True, +        }, +    }, { +        'url': 'http://m.canalplus.fr/?vid=1398231', +        'only_matching': True, +    }, { +        'url': 'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061', +        'only_matching': True,      }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.groupdict().get('id') +        video_id = mobj.groupdict().get('id') or mobj.groupdict().get('vid') -        site_id = self._SITE_ID_MAP[mobj.group('site') 
or 'canal'] +        site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]]          # Beware, some subclasses do not define an id group -        display_id = url_basename(mobj.group('path')) +        display_id = mobj.group('display_id') or video_id          if video_id is None:              webpage = self._download_webpage(url, display_id) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 68a0633b6..ff663d079 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -4,65 +4,66 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( +    js_to_json, +    smuggle_url, +)  class CBCIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:[^/]+/)+(?P<id>[^/?#]+)' +    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'      _TESTS = [{          # with mediaId          'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs', +        'md5': '97e24d09672fc4cf56256d6faa6c25bc',          'info_dict': {              'id': '2682904050', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Don Cherry – All-Stars',              'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.', -            'timestamp': 1454475540, +            'timestamp': 1454463000,              'upload_date': '20160203', -        }, -        'params': { -            # rtmp download -            'skip_download': True, +            'uploader': 'CBCC-NEW',          },      }, {          # with clipId          'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', +        'md5': '0274a90b51a9b4971fe005c63f592f12',          'info_dict': {              'id': '2487345465', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Robin 
Williams freestyles on 90 Minutes Live',              'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', -            'upload_date': '19700101', +            'upload_date': '19780210',              'uploader': 'CBCC-NEW', -        }, -        'params': { -            # rtmp download -            'skip_download': True, +            'timestamp': 255977160,          },      }, {          # multiple iframes          'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',          'playlist': [{ +            'md5': '377572d0b49c4ce0c9ad77470e0b96b4',              'info_dict': {                  'id': '2680832926', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'An Eagle\'s-Eye View Off Burrard Bridge',                  'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.', -                'upload_date': '19700101', +                'upload_date': '20160201', +                'timestamp': 1454342820, +                'uploader': 'CBCC-NEW',              },          }, { +            'md5': '415a0e3f586113894174dfb31aa5bb1a',              'info_dict': {                  'id': '2658915080', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'Fly like an eagle!',                  'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower', -                'upload_date': '19700101', +                'upload_date': '20150315', +                'timestamp': 1426443984, +                'uploader': 'CBCC-NEW',              },          }], -        'params': { -            # rtmp download -            'skip_download': True, -        },      }]      @classmethod @@ -91,24 +92,54 @@ class CBCIE(InfoExtractor):  class 
CBCPlayerIE(InfoExtractor):      _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.cbc.ca/player/play/2683190193', +        'md5': '64d25f841ddf4ddb28a235338af32e2c',          'info_dict': {              'id': '2683190193', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Gerry Runs a Sweat Shop',              'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0', -            'timestamp': 1455067800, +            'timestamp': 1455071400,              'upload_date': '20160210', +            'uploader': 'CBCC-NEW',          }, -        'params': { -            # rtmp download -            'skip_download': True, +    }, { +        # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ +        'url': 'http://www.cbc.ca/player/play/2657631896', +        'md5': 'e5e708c34ae6fca156aafe17c43e8b75', +        'info_dict': { +            'id': '2657631896', +            'ext': 'mp3', +            'title': 'CBC Montreal is organizing its first ever community hackathon!', +            'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. 
Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.', +            'timestamp': 1425704400, +            'upload_date': '20150307', +            'uploader': 'CBCC-NEW',          }, -    } +    }, { +        # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url +        'url': 'http://www.cbc.ca/player/play/2164402062', +        'md5': '17a61eb813539abea40618d6323a7f82', +        'info_dict': { +            'id': '2164402062', +            'ext': 'flv', +            'title': 'Cancer survivor four times over', +            'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', +            'timestamp': 1320410746, +            'upload_date': '20111104', +            'uploader': 'CBCC-NEW', +        }, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) -        return self.url_result( -            'http://feed.theplatform.com/f/ExhSPC/vms_5akSXx4Ng_Zn?byGuid=%s' % video_id, -            'ThePlatformFeed', video_id) +        return { +            '_type': 'url_transparent', +            'ie_key': 'ThePlatform', +            'url': smuggle_url( +                'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, { +                    'force_smil_url': True +                }), +            'id': video_id, +        } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 051d783a2..ac2c7dced 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,5 +1,7 @@  from __future__ import unicode_literals +import re +  from .theplatform import ThePlatformIE  from ..utils import (      xpath_text, @@ -21,7 +23,7 @@ class CBSBaseIE(ThePlatformIE):  class CBSIE(CBSBaseIE): -    _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)' +    _VALID_URL = 
r'(?:cbs:(?P<content_id>\w+)|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<display_id>[^/]+))'      _TESTS = [{          'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -66,11 +68,12 @@ class CBSIE(CBSBaseIE):      TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true'      def _real_extract(self, url): -        display_id = self._match_id(url) -        webpage = self._download_webpage(url, display_id) -        content_id = self._search_regex( -            [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], -            webpage, 'content id') +        content_id, display_id = re.match(self._VALID_URL, url).groups() +        if not content_id: +            webpage = self._download_webpage(url, display_id) +            content_id = self._search_regex( +                [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], +                webpage, 'content id')          items_data = self._download_xml(              'http://can.cbs.com/thunder/player/videoPlayerService.php',              content_id, query={'partner': 'cbs', 'contentId': content_id}) diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py new file mode 100644 index 000000000..74adb38a6 --- /dev/null +++ b/youtube_dl/extractor/cbslocal.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import calendar +import datetime + +from .anvato import AnvatoIE +from .sendtonews import SendtoNewsIE +from ..compat import compat_urlparse + + +class CBSLocalIE(AnvatoIE): +    _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/\d+/\d+/\d+/(?P<id>[0-9a-z-]+)' + +    _TESTS = [{ +        # Anvato backend +        'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis', +   
     'md5': 'f0ee3081e3843f575fccef901199b212', +        'info_dict': { +            'id': '3401037', +            'ext': 'mp4', +            'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', +            'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.', +            'thumbnail': 're:^https?://.*', +            'timestamp': 1463440500, +            'upload_date': '20160516', +            'subtitles': { +                'en': 'mincount:5', +            }, +            'categories': [ +                'Stations\\Spoken Word\\KCBSTV', +                'Syndication\\MSN', +                'Syndication\\NDN', +                'Syndication\\AOL', +                'Syndication\\Yahoo', +                'Syndication\\Tribune', +                'Syndication\\Curb.tv', +                'Content\\News' +            ], +        }, +    }, { +        # SendtoNews embed +        'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', +        'info_dict': { +            'id': 'GxfCe0Zo7D-175909-5588', +            'ext': 'mp4', +            'title': 'Recap: CLE 15, CIN 6', +            'description': '5/16/16: Indians\' bats explode for 15 runs in a win', +            'upload_date': '20160516', +            'timestamp': 1463433840, +            'duration': 49, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) + +        sendtonews_url = SendtoNewsIE._extract_url(webpage) +        if sendtonews_url: +            info_dict = { +                '_type': 'url_transparent', +                'url': compat_urlparse.urljoin(url, sendtonews_url), +            
} +        else: +            info_dict = self._extract_anvato_videos(webpage, display_id) + +        time_str = self._html_search_regex( +            r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False) +        timestamp = None +        if time_str: +            timestamp = calendar.timegm(datetime.datetime.strptime( +                time_str, '%b %d, %Y %I:%M %p').timetuple()) + +        info_dict.update({ +            'display_id': display_id, +            'timestamp': timestamp, +        }) + +        return info_dict diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 6652c8e42..5a58d1777 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -33,20 +33,34 @@ class CeskaTelevizeIE(InfoExtractor):              'skip_download': True,          },      }, { -        'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', +        'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',          'info_dict': { -            'id': '61924494876844374', +            'id': '61924494877028507',              'ext': 'mp4', -            'title': 'První republika: Zpěvačka z Dupárny Bobina', -            'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 
1945.', +            'title': 'Hyde Park Civilizace: Bonus 01 - En', +            'description': 'English Subtittles',              'thumbnail': 're:^https?://.*\.jpg', -            'duration': 88.4, +            'duration': 81.3,          },          'params': {              # m3u8 download              'skip_download': True,          },      }, { +        # live stream +        'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', +        'info_dict': { +            'id': 402, +            'ext': 'mp4', +            'title': 're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', +            'is_live': True, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +        'skip': 'Georestricted to Czech Republic', +    }, {          # video with 18+ caution trailer          'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',          'info_dict': { @@ -118,19 +132,21 @@ class CeskaTelevizeIE(InfoExtractor):          req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))          req.add_header('Referer', url) -        playlist_title = self._og_search_title(webpage) -        playlist_description = self._og_search_description(webpage) +        playlist_title = self._og_search_title(webpage, default=None) +        playlist_description = self._og_search_description(webpage, default=None)          playlist = self._download_json(req, playlist_id)['playlist']          playlist_len = len(playlist)          entries = []          for item in playlist: +            is_live = item.get('type') == 'LIVE'              formats = []              for format_id, stream_url in item['streamUrls'].items():                  formats.extend(self._extract_m3u8_formats(                      stream_url, playlist_id, 'mp4', -                    entry_protocol='m3u8_native', fatal=False)) +                    entry_protocol='m3u8' if is_live else 'm3u8_native', +                    
fatal=False))              self._sort_formats(formats)              item_id = item.get('id') or item['assetId'] @@ -145,14 +161,22 @@ class CeskaTelevizeIE(InfoExtractor):                  if subs:                      subtitles = self.extract_subtitles(episode_id, subs) +            if playlist_len == 1: +                final_title = playlist_title or title +                if is_live: +                    final_title = self._live_title(final_title) +            else: +                final_title = '%s (%s)' % (playlist_title, title) +              entries.append({                  'id': item_id, -                'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title), +                'title': final_title,                  'description': playlist_description if playlist_len == 1 else None,                  'thumbnail': thumbnail,                  'duration': duration,                  'formats': formats,                  'subtitles': subtitles, +                'is_live': is_live,              })          return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index c74553dcf..34d4e6156 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -20,54 +20,64 @@ class Channel9IE(InfoExtractor):      '''      IE_DESC = 'Channel 9'      IE_NAME = 'channel9' -    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' 
- -    _TESTS = [ -        { -            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', -            'md5': 'bbd75296ba47916b754e73c3a4bbdf10', -            'info_dict': { -                'id': 'Events/TechEd/Australia/2013/KOS002', -                'ext': 'mp4', -                'title': 'Developer Kick-Off Session: Stuff We Love', -                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', -                'duration': 4576, -                'thumbnail': 're:http://.*\.jpg', -                'session_code': 'KOS002', -                'session_day': 'Day 1', -                'session_room': 'Arena 1A', -                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'], -            }, +    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' + +    _TESTS = [{ +        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', +        'md5': 'bbd75296ba47916b754e73c3a4bbdf10', +        'info_dict': { +            'id': 'Events/TechEd/Australia/2013/KOS002', +            'ext': 'mp4', +            'title': 'Developer Kick-Off Session: Stuff We Love', +            'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', +            'duration': 4576, +            'thumbnail': 're:http://.*\.jpg', +            'session_code': 'KOS002', +            'session_day': 'Day 1', +            'session_room': 'Arena 1A', +            'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', +                                 'Mads Kristensen'],          }, -        { -            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', -            'md5': 'b43ee4529d111bc37ba7ee4f34813e68', -            'info_dict': { -                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', -                'ext': 'mp4', -                'title': 'Self-service 
BI with Power BI - nuclear testing', -                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', -                'duration': 1540, -                'thumbnail': 're:http://.*\.jpg', -                'authors': ['Mike Wilmot'], -            }, +    }, { +        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', +        'md5': 'b43ee4529d111bc37ba7ee4f34813e68', +        'info_dict': { +            'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', +            'ext': 'mp4', +            'title': 'Self-service BI with Power BI - nuclear testing', +            'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', +            'duration': 1540, +            'thumbnail': 're:http://.*\.jpg', +            'authors': ['Mike Wilmot'],          }, -        { -            # low quality mp4 is best -            'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', -            'info_dict': { -                'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', -                'ext': 'mp4', -                'title': 'Ranges for the Standard Library', -                'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', -                'duration': 5646, -                'thumbnail': 're:http://.*\.jpg', -            }, -            'params': { -                'skip_download': True, -            }, -        } -    ] +    }, { +        # low quality mp4 is best +        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', +        'info_dict': { +            'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', +            'ext': 'mp4', +            'title': 'Ranges for the Standard Library', +            'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', +            'duration': 5646, +            'thumbnail': 're:http://.*\.jpg', +        }, +        'params': { +            'skip_download': True, +        }, +   
 }, { +        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', +        'info_dict': { +            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b', +            'title': 'Channel 9', +        }, +        'playlist_count': 2, +    }, { +        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', +        'only_matching': True, +    }, { +        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', +        'only_matching': True, +    }]      _RSS_URL = 'http://channel9.msdn.com/%s/RSS' @@ -254,22 +264,30 @@ class Channel9IE(InfoExtractor):          return self.playlist_result(contents) -    def _extract_list(self, content_path): -        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS') +    def _extract_list(self, video_id, rss_url=None): +        if not rss_url: +            rss_url = self._RSS_URL % video_id +        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')          entries = [self.url_result(session_url.text, 'Channel9')                     for session_url in rss.findall('./channel/item/link')]          title_text = rss.find('./channel/title').text -        return self.playlist_result(entries, content_path, title_text) +        return self.playlist_result(entries, video_id, title_text)      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          content_path = mobj.group('contentpath') +        rss = mobj.group('rss') + +        if rss: +            return self._extract_list(content_path, url) -        webpage = self._download_webpage(url, content_path, 'Downloading web page') +        webpage = self._download_webpage( +            url, content_path, 'Downloading web page') -        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage) -        if page_type_m is not None: -            
page_type = page_type_m.group('pagetype') +        page_type = self._search_regex( +            r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2', +            webpage, 'page type', default=None, group='pagetype') +        if page_type:              if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content                  return self._extract_entry_item(webpage, content_path)              elif page_type == 'Session':  # Event session page, may contain downloadable content @@ -278,6 +296,5 @@ class Channel9IE(InfoExtractor):                  return self._extract_list(content_path)              else:                  raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True) -          else:  # Assuming list              return self._extract_list(content_path) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py deleted file mode 100644 index 042c4f2f1..000000000 --- a/youtube_dl/extractor/cinemassacre.py +++ /dev/null @@ -1,119 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ExtractorError -from .screenwavemedia import ScreenwaveMediaIE - - -class CinemassacreIE(InfoExtractor): -    _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)' -    _TESTS = [ -        { -            'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', -            'md5': 'fde81fbafaee331785f58cd6c0d46190', -            'info_dict': { -                'id': 'Cinemassacre-19911', -                'ext': 'mp4', -                'upload_date': '20121110', -                'title': '“Angry Video Game Nerd: The Movie” – Trailer', -                'description': 'md5:fb87405fcb42a331742a0dce2708560b', -            }, -            'params': { -                # m3u8 download -                
'skip_download': True, -            }, -        }, -        { -            'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', -            'md5': 'd72f10cd39eac4215048f62ab477a511', -            'info_dict': { -                'id': 'Cinemassacre-521be8ef82b16', -                'ext': 'mp4', -                'upload_date': '20131002', -                'title': 'The Mummy’s Hand (1940)', -            }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, -        }, -        { -            # Youtube embedded video -            'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', -            'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9', -            'info_dict': { -                'id': 'OEVzPCY2T-g', -                'ext': 'webm', -                'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', -                'upload_date': '20061207', -                'uploader': 'Cinemassacre', -                'uploader_id': 'JamesNintendoNerd', -                'description': 'md5:784734696c2b8b7f4b8625cc799e07f6', -            } -        }, -        { -            # Youtube embedded video -            'url': 'http://cinemassacre.com/2006/09/01/mckids/', -            'md5': '7393c4e0f54602ad110c793eb7a6513a', -            'info_dict': { -                'id': 'FnxsNhuikpo', -                'ext': 'webm', -                'upload_date': '20060901', -                'uploader': 'Cinemassacre Extra', -                'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53', -                'uploader_id': 'Cinemassacre', -                'title': 'AVGN: McKids', -            } -        }, -        { -            'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', -            'md5': '1376908e49572389e7b06251a53cdd08', -            'info_dict': { -                
'id': 'Cinemassacre-555779690c440', -                'ext': 'mp4', -                'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', -                'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', -                'upload_date': '20150525', -            }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, -        } -    ] - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        display_id = mobj.group('display_id') -        video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d') - -        webpage = self._download_webpage(url, display_id) - -        playerdata_url = self._search_regex( -            [ -                ScreenwaveMediaIE.EMBED_PATTERN, -                r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', -            ], -            webpage, 'player data URL', default=None, group='url') -        if not playerdata_url: -            raise ExtractorError('Unable to find player data') - -        video_title = self._html_search_regex( -            r'<title>(?P<title>.+?)\|', webpage, 'title') -        video_description = self._html_search_regex( -            r'<div class="entry-content">(?P<description>.+?)</div>', -            webpage, 'description', flags=re.DOTALL, fatal=False) -        video_thumbnail = self._og_search_thumbnail(webpage) - -        return { -            '_type': 'url_transparent', -            'display_id': display_id, -            'title': video_title, -            'description': video_description, -            'upload_date': video_date, -            'thumbnail': video_thumbnail, -            'url': playerdata_url, -        } diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py deleted file mode 
100644 index 002b24037..000000000 --- a/youtube_dl/extractor/collegehumor.py +++ /dev/null @@ -1,101 +0,0 @@ -from __future__ import unicode_literals - -import json -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class CollegeHumorIE(InfoExtractor): -    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$' - -    _TESTS = [ -        { -            'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', -            'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd', -            'info_dict': { -                'id': '6902724', -                'ext': 'mp4', -                'title': 'Comic-Con Cosplay Catastrophe', -                'description': "Fans get creative this year at San Diego.  Too creative.  And yes, that's really Joss Whedon.", -                'age_limit': 13, -                'duration': 187, -            }, -        }, { -            'url': 'http://www.collegehumor.com/video/3505939/font-conference', -            'md5': '72fa701d8ef38664a4dbb9e2ab721816', -            'info_dict': { -                'id': '3505939', -                'ext': 'mp4', -                'title': 'Font Conference', -                'description': "This video wasn't long enough, so we made it double-spaced.", -                'age_limit': 10, -                'duration': 179, -            }, -        }, { -            # embedded youtube video -            'url': 'http://www.collegehumor.com/embed/6950306', -            'info_dict': { -                'id': 'Z-bao9fg6Yc', -                'ext': 'mp4', -                'title': 'Young Americans Think President John F. 
Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!', -                'uploader': 'Mark Dice', -                'uploader_id': 'MarkDice', -                'description': 'md5:62c3dab9351fac7bb44b53b69511d87f', -                'upload_date': '20140127', -            }, -            'params': { -                'skip_download': True, -            }, -            'add_ie': ['Youtube'], -        }, -    ] - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('videoid') - -        jsonUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id + '.json' -        data = json.loads(self._download_webpage( -            jsonUrl, video_id, 'Downloading info JSON')) -        vdata = data['video'] -        if vdata.get('youtubeId') is not None: -            return { -                '_type': 'url', -                'url': vdata['youtubeId'], -                'ie_key': 'Youtube', -            } - -        AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0} -        rating = vdata.get('rating') -        if rating: -            age_limit = AGE_LIMITS.get(rating.lower()) -        else: -            age_limit = None  # None = No idea - -        PREFS = {'high_quality': 2, 'low_quality': 0} -        formats = [] -        for format_key in ('mp4', 'webm'): -            for qname, qurl in vdata.get(format_key, {}).items(): -                formats.append({ -                    'format_id': format_key + '_' + qname, -                    'url': qurl, -                    'format': format_key, -                    'preference': PREFS.get(qname), -                }) -        self._sort_formats(formats) - -        duration = int_or_none(vdata.get('duration'), 1000) -        like_count = int_or_none(vdata.get('likes')) - -        return { -            'id': video_id, -            'title': vdata['title'], -            'description': vdata.get('description'), -            'thumbnail': vdata.get('thumbnail'), -         
   'formats': formats, -            'age_limit': age_limit, -            'duration': duration, -            'like_count': like_count, -        } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 0c59102e0..2b6aaa3aa 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -44,10 +44,10 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):      #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524      _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow)                        |https?://(:www\.)? -                          (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/ +                          (?P<showname>thedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/                           ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|                            (?P<clip> -                              (?:(?:guests/[^/]+|videos|video-playlists|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+)) +                              (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))                                |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))                                |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))                            )| @@ -129,6 +129,9 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):      }, {          'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel',          'only_matching': True, +    }, { +        'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', +        'only_matching': True,      }]      _available_formats = ['3500', '2200', '1700', '1200', '750', '400'] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0843d89af..bfd432160 
100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -45,6 +45,7 @@ from ..utils import (      unescapeHTML,      unified_strdate,      url_basename, +    xpath_element,      xpath_text,      xpath_with_ns,      determine_protocol, @@ -987,7 +988,7 @@ class InfoExtractor(object):      def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,                               transform_source=lambda s: fix_xml_ampersands(s).strip(), -                             fatal=True): +                             fatal=True, m3u8_id=None):          manifest = self._download_xml(              manifest_url, video_id, 'Downloading f4m manifest',              'Unable to download f4m manifest', @@ -1001,11 +1002,11 @@ class InfoExtractor(object):          return self._parse_f4m_formats(              manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, -            transform_source=transform_source, fatal=fatal) +            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)      def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,                             transform_source=lambda s: fix_xml_ampersands(s).strip(), -                           fatal=True): +                           fatal=True, m3u8_id=None):          # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy          akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')          if akamai_pv is not None and ';' in akamai_pv.text: @@ -1029,9 +1030,26 @@ class InfoExtractor(object):              'base URL', default=None)          if base_url:              base_url = base_url.strip() + +        bootstrap_info = xpath_element( +            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], +            'bootstrap info', default=None) +          for i, media_el in enumerate(media_nodes): -   
         if manifest_version == '2.0': -                media_url = media_el.attrib.get('href') or media_el.attrib.get('url') +            tbr = int_or_none(media_el.attrib.get('bitrate')) +            width = int_or_none(media_el.attrib.get('width')) +            height = int_or_none(media_el.attrib.get('height')) +            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) +            # If <bootstrapInfo> is present, the specified f4m is a +            # stream-level manifest, and only set-level manifests may refer to +            # external resources.  See section 11.4 and section 4 of F4M spec +            if bootstrap_info is None: +                media_url = None +                # @href is introduced in 2.0, see section 11.6 of F4M spec +                if manifest_version == '2.0': +                    media_url = media_el.attrib.get('href') +                if media_url is None: +                    media_url = media_el.attrib.get('url')                  if not media_url:                      continue                  manifest_url = ( @@ -1041,29 +1059,43 @@ class InfoExtractor(object):                  # since bitrates in parent manifest (this one) and media_url manifest                  # may differ leading to inability to resolve the format by requested                  # bitrate in f4m downloader -                if determine_ext(manifest_url) == 'f4m': -                    formats.extend(self._extract_f4m_formats( +                ext = determine_ext(manifest_url) +                if ext == 'f4m': +                    f4m_formats = self._extract_f4m_formats(                          manifest_url, video_id, preference=preference, f4m_id=f4m_id, -                        transform_source=transform_source, fatal=fatal)) +                        transform_source=transform_source, fatal=fatal) +                    # Sometimes stream-level manifest contains single media entry that +                    # does not contain 
any quality metadata (e.g. http://matchtv.ru/#live-player). +                    # At the same time parent's media entry in set-level manifest may +                    # contain it. We will copy it from parent in such cases. +                    if len(f4m_formats) == 1: +                        f = f4m_formats[0] +                        f.update({ +                            'tbr': f.get('tbr') or tbr, +                            'width': f.get('width') or width, +                            'height': f.get('height') or height, +                            'format_id': f.get('format_id') if not tbr else format_id, +                        }) +                    formats.extend(f4m_formats) +                    continue +                elif ext == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        manifest_url, video_id, 'mp4', preference=preference, +                        m3u8_id=m3u8_id, fatal=fatal))                      continue -            tbr = int_or_none(media_el.attrib.get('bitrate'))              formats.append({ -                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), +                'format_id': format_id,                  'url': manifest_url, -                'ext': 'flv', +                'ext': 'flv' if bootstrap_info is not None else None,                  'tbr': tbr, -                'width': int_or_none(media_el.attrib.get('width')), -                'height': int_or_none(media_el.attrib.get('height')), +                'width': width, +                'height': height,                  'preference': preference,              })          return formats -    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, -                              entry_protocol='m3u8', preference=None, -                              m3u8_id=None, note=None, errnote=None, -                              fatal=True, live=False): - -        formats = [{ +    def 
_m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None): +        return {              'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),              'url': m3u8_url,              'ext': ext, @@ -1071,7 +1103,14 @@ class InfoExtractor(object):              'preference': preference - 1 if preference else -1,              'resolution': 'multiple',              'format_note': 'Quality selection URL', -        }] +        } + +    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, +                              entry_protocol='m3u8', preference=None, +                              m3u8_id=None, note=None, errnote=None, +                              fatal=True, live=False): + +        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]          format_url = lambda u: (              u @@ -1138,12 +1177,15 @@ class InfoExtractor(object):                  format_id = []                  if m3u8_id:                      format_id.append(m3u8_id) -                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None +                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None +                # Despite specification does not mention NAME attribute for +                # EXT-X-STREAM-INF it still sometimes may be present +                stream_name = last_info.get('NAME') or last_media_name                  # Bandwidth of live streams may differ over time thus making                  # format_id unpredictable. So it's better to keep provided                  # format_id intact.                  
if not live: -                    format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) +                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))                  f = {                      'format_id': '-'.join(format_id),                      'url': format_url(line.strip()), @@ -1275,21 +1317,21 @@ class InfoExtractor(object):          m3u8_count = 0          srcs = [] -        videos = smil.findall(self._xpath_ns('.//video', namespace)) -        for video in videos: -            src = video.get('src') +        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) +        for medium in media: +            src = medium.get('src')              if not src or src in srcs:                  continue              srcs.append(src) -            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) -            filesize = int_or_none(video.get('size') or video.get('fileSize')) -            width = int_or_none(video.get('width')) -            height = int_or_none(video.get('height')) -            proto = video.get('proto') -            ext = video.get('ext') +            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) +            filesize = int_or_none(medium.get('size') or medium.get('fileSize')) +            width = int_or_none(medium.get('width')) +            height = int_or_none(medium.get('height')) +            proto = medium.get('proto') +            ext = medium.get('ext')              src_ext = determine_ext(src) -            streamer = video.get('streamer') or base +            streamer = medium.get('streamer') or base              if proto == 'rtmp' or streamer.startswith('rtmp'):                  rtmp_count += 1 diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py new file mode 100644 index 000000000..a901b8d22 
--- /dev/null +++ b/youtube_dl/extractor/coub.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    float_or_none, +    int_or_none, +    parse_iso8601, +    qualities, +) + + +class CoubIE(InfoExtractor): +    _VALID_URL = r'(?:coub:|https?://(?:coub\.com/(?:view|embed|coubs)/|c-cdn\.coub\.com/fb-player\.swf\?.*\bcoub(?:ID|id)=))(?P<id>[\da-z]+)' + +    _TESTS = [{ +        'url': 'http://coub.com/view/5u5n1', +        'info_dict': { +            'id': '5u5n1', +            'ext': 'mp4', +            'title': 'The Matrix Moonwalk', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 4.6, +            'timestamp': 1428527772, +            'upload_date': '20150408', +            'uploader': 'Артём Лоскутников', +            'uploader_id': 'artyom.loskutnikov', +            'view_count': int, +            'like_count': int, +            'repost_count': int, +            'comment_count': int, +            'age_limit': 0, +        }, +    }, { +        'url': 'http://c-cdn.coub.com/fb-player.swf?bot_type=vk&coubID=7w5a4', +        'only_matching': True, +    }, { +        'url': 'coub:5u5n1', +        'only_matching': True, +    }, { +        # longer video id +        'url': 'http://coub.com/view/237d5l5h', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        coub = self._download_json( +            'http://coub.com/api/v2/coubs/%s.json' % video_id, video_id) + +        if coub.get('error'): +            raise ExtractorError( +                '%s said: %s' % (self.IE_NAME, coub['error']), expected=True) + +        title = coub['title'] + +        file_versions = coub['file_versions'] + +        QUALITIES = ('low', 'med', 'high') + +        MOBILE = 'mobile' +        IPHONE = 'iphone' +        HTML5 = 'html5' + +        SOURCE_PREFERENCE = (MOBILE, IPHONE, HTML5) 
+ +        quality_key = qualities(QUALITIES) +        preference_key = qualities(SOURCE_PREFERENCE) + +        formats = [] + +        for kind, items in file_versions.get(HTML5, {}).items(): +            if kind not in ('video', 'audio'): +                continue +            if not isinstance(items, dict): +                continue +            for quality, item in items.items(): +                if not isinstance(item, dict): +                    continue +                item_url = item.get('url') +                if not item_url: +                    continue +                formats.append({ +                    'url': item_url, +                    'format_id': '%s-%s-%s' % (HTML5, kind, quality), +                    'filesize': int_or_none(item.get('size')), +                    'vcodec': 'none' if kind == 'audio' else None, +                    'quality': quality_key(quality), +                    'preference': preference_key(HTML5), +                }) + +        iphone_url = file_versions.get(IPHONE, {}).get('url') +        if iphone_url: +            formats.append({ +                'url': iphone_url, +                'format_id': IPHONE, +                'preference': preference_key(IPHONE), +            }) + +        mobile_url = file_versions.get(MOBILE, {}).get('audio_url') +        if mobile_url: +            formats.append({ +                'url': mobile_url, +                'format_id': '%s-audio' % MOBILE, +                'preference': preference_key(MOBILE), +            }) + +        self._sort_formats(formats) + +        thumbnail = coub.get('picture') +        duration = float_or_none(coub.get('duration')) +        timestamp = parse_iso8601(coub.get('published_at') or coub.get('created_at')) +        uploader = coub.get('channel', {}).get('title') +        uploader_id = coub.get('channel', {}).get('permalink') + +        view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count')) +        like_count = 
int_or_none(coub.get('likes_count')) +        repost_count = int_or_none(coub.get('recoubs_count')) +        comment_count = int_or_none(coub.get('comments_count')) + +        age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin')) +        if age_restricted is not None: +            age_limit = 18 if age_restricted is True else 0 +        else: +            age_limit = None + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': thumbnail, +            'duration': duration, +            'timestamp': timestamp, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'view_count': view_count, +            'like_count': like_count, +            'repost_count': repost_count, +            'comment_count': comment_count, +            'age_limit': age_limit, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py new file mode 100644 index 000000000..b60a1d813 --- /dev/null +++ b/youtube_dl/extractor/dailymail.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    determine_protocol, +) + + +class DailyMailIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', +        'md5': '2f639d446394f53f3a33658b518b6615', +        'info_dict': { +            'id': '1288527', +            'ext': 'mp4', +            'title': 'Turn any video into an impressionist masterpiece', +            'description': 'md5:88ddbcb504367987b2708bb38677c9d2', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        video_data = 
self._parse_json(self._search_regex( +            r"data-opts='({.+?})'", webpage, 'video data'), video_id) +        title = video_data['title'] +        video_sources = self._download_json(video_data.get( +            'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + +        formats = [] +        for rendition in video_sources['renditions']: +            rendition_url = rendition.get('url') +            if not rendition_url: +                continue +            tbr = int_or_none(rendition.get('encodingRate'), 1000) +            container = rendition.get('videoContainer') +            is_hls = container == 'M2TS' +            protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url}) +            formats.append({ +                'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''), +                'url': rendition_url, +                'width': int_or_none(rendition.get('frameWidth')), +                'height': int_or_none(rendition.get('frameHeight')), +                'tbr': tbr, +                'vcodec': rendition.get('videoCodec'), +                'container': container, +                'protocol': protocol, +                'ext': 'mp4' if is_hls else None, +            }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': video_data.get('descr'), +            'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py index ae7c571bd..0f0f0b8d3 100644 --- a/youtube_dl/extractor/dw.py +++ b/youtube_dl/extractor/dw.py @@ -2,13 +2,16 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( +    int_or_none, +    unified_strdate, +)  from ..compat import 
compat_urlparse  class DWIE(InfoExtractor):      IE_NAME = 'dw' -    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+(?:av|e)-(?P<id>\d+)'      _TESTS = [{          # video          'url': 'http://www.dw.com/en/intelligent-light/av-19112290', @@ -31,6 +34,16 @@ class DWIE(InfoExtractor):              'description': 'md5:bc9ca6e4e063361e21c920c53af12405',              'upload_date': '20160311',          } +    }, { +        'url': 'http://www.dw.com/en/documentaries-welcome-to-the-90s-2016-05-21/e-19220158-9798', +        'md5': '56b6214ef463bfb9a3b71aeb886f3cf1', +        'info_dict': { +            'id': '19274438', +            'ext': 'mp4', +            'title': 'Welcome to the 90s – Hip Hop', +            'description': 'Welcome to the 90s - The Golden Decade of Hip Hop', +            'upload_date': '20160521', +        },      }]      def _real_extract(self, url): @@ -38,6 +51,7 @@ class DWIE(InfoExtractor):          webpage = self._download_webpage(url, media_id)          hidden_inputs = self._hidden_inputs(webpage)          title = hidden_inputs['media_title'] +        media_id = hidden_inputs.get('media_id') or media_id          if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':              formats = self._extract_smil_formats( @@ -49,13 +63,20 @@ class DWIE(InfoExtractor):          else:              formats = [{'url': hidden_inputs['file_name']}] +        upload_date = hidden_inputs.get('display_date') +        if not upload_date: +            upload_date = self._html_search_regex( +                r'<span[^>]+class="date">([0-9.]+)\s*\|', webpage, +                'upload date', default=None) +            upload_date = unified_strdate(upload_date) +          return {              'id': media_id,              'title': title,              'description': self._og_search_description(webpage),              'thumbnail': 
hidden_inputs.get('preview_image'),              'duration': int_or_none(hidden_inputs.get('file_duration')), -            'upload_date': hidden_inputs.get('display_date'), +            'upload_date': upload_date,              'formats': formats,          } diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index e006921ec..ac5d0fe24 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -11,8 +11,8 @@ from ..utils import (  class EpornerIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)/(?P<display_id>[\w-]+)' +    _TESTS = [{          'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',          'md5': '39d486f046212d8e1b911c52ab4691f8',          'info_dict': { @@ -23,8 +23,12 @@ class EpornerIE(InfoExtractor):              'duration': 1838,              'view_count': int,              'age_limit': 18, -        } -    } +        }, +    }, { +        # New (May 2016) URL layout +        'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', +        'only_matching': True, +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index db4b263bc..66c08bec4 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -8,6 +8,7 @@ class ESPNIE(InfoExtractor):      _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'      _TESTS = [{          'url': 'http://espn.go.com/video/clip?id=10365079', +        'md5': '60e5d097a523e767d06479335d1bdc58',          'info_dict': {              'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',              'ext': 'mp4', @@ -15,21 +16,22 @@ class ESPNIE(InfoExtractor):              'description': None,          },          'params': { -            
# m3u8 download              'skip_download': True,          }, +        'add_ie': ['OoyalaExternal'],      }, {          # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season          'url': 'http://espn.go.com/video/clip?id=2743663', +        'md5': 'f4ac89b59afc7e2d7dbb049523df6768',          'info_dict': {              'id': '50NDFkeTqRHB0nXBOK-RGdSG5YQPuxHg',              'ext': 'mp4',              'title': 'Must-See Moments: Best of the MLS season',          },          'params': { -            # m3u8 download              'skip_download': True,          }, +        'add_ie': ['OoyalaExternal'],      }, {          'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',          'only_matching': True, diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1f95530a5..aa98782a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals  from .abc import ABCIE  from .abc7news import Abc7NewsIE +from .abcnews import ( +    AbcNewsIE, +    AbcNewsVideoIE, +)  from .academicearth import AcademicEarthCourseIE  from .acast import (      ACastIE, @@ -53,6 +57,7 @@ from .arte import (      ArteTVDDCIE,      ArteTVMagazineIE,      ArteTVEmbedIE, +    ArteTVPlaylistIE,  )  from .atresplayer import AtresPlayerIE  from .atttechchannel import ATTTechChannelIE @@ -76,6 +81,7 @@ from .bigflix import BigflixIE  from .bild import BildIE  from .bilibili import BiliBiliIE  from .biobiochiletv import BioBioChileTVIE +from .biqle import BIQLEIE  from .bleacherreport import (      BleacherReportIE,      BleacherReportCMSIE, @@ -107,6 +113,7 @@ from .cbc import (      CBCPlayerIE,  )  from .cbs import CBSIE +from .cbslocal import CBSLocalIE  from .cbsinteractive import CBSInteractiveIE  from .cbsnews import (      CBSNewsIE, @@ -124,7 +131,6 @@ from .chirbit import (      
ChirbitProfileIE,  )  from .cinchcast import CinchcastIE -from .cinemassacre import CinemassacreIE  from .cliprs import ClipRsIE  from .clipfish import ClipfishIE  from .cliphunter import CliphunterIE @@ -139,7 +145,7 @@ from .cnn import (      CNNBlogsIE,      CNNArticleIE,  ) -from .collegehumor import CollegeHumorIE +from .coub import CoubIE  from .collegerama import CollegeRamaIE  from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE  from .comcarcoff import ComCarCoffIE @@ -158,6 +164,7 @@ from .cspan import CSpanIE  from .ctsnews import CtsNewsIE  from .cultureunplugged import CultureUnpluggedIE  from .cwtv import CWTVIE +from .dailymail import DailyMailIE  from .dailymotion import (      DailymotionIE,      DailymotionPlaylistIE, @@ -227,6 +234,7 @@ from .everyonesmixtape import EveryonesMixtapeIE  from .exfm import ExfmIE  from .expotv import ExpoTVIE  from .extremetube import ExtremeTubeIE +from .eyedotv import EyedoTVIE  from .facebook import FacebookIE  from .faz import FazIE  from .fc2 import FC2IE @@ -239,6 +247,7 @@ from .fktv import FKTVIE  from .flickr import FlickrIE  from .folketinget import FolketingetIE  from .footyroom import FootyRoomIE +from .formula1 import Formula1IE  from .fourtube import FourTubeIE  from .fox import FOXIE  from .foxgay import FoxgayIE @@ -366,6 +375,7 @@ from .kuwo import (  )  from .la7 import LA7IE  from .laola1tv import Laola1TvIE +from .learnr import LearnrIE  from .lecture2go import Lecture2GoIE  from .lemonde import LemondeIE  from .leeco import ( @@ -373,6 +383,7 @@ from .leeco import (      LePlaylistIE,      LetvCloudIE,  ) +from .libraryofcongress import LibraryOfCongressIE  from .libsyn import LibsynIE  from .lifenews import (      LifeNewsIE, @@ -383,6 +394,7 @@ from .limelight import (      LimelightChannelIE,      LimelightChannelListIE,  ) +from .litv import LiTVIE  from .liveleak import LiveLeakIE  from .livestream import (      LivestreamIE, @@ -390,6 +402,7 @@ from .livestream import (      
LivestreamShortenerIE,  )  from .lnkgo import LnkGoIE +from .localnews8 import LocalNews8IE  from .lovehomeporn import LoveHomePornIE  from .lrt import LRTIE  from .lynda import ( @@ -407,6 +420,10 @@ from .metacafe import MetacafeIE  from .metacritic import MetacriticIE  from .mgoon import MgoonIE  from .mgtv import MGTVIE +from .microsoftvirtualacademy import ( +    MicrosoftVirtualAcademyIE, +    MicrosoftVirtualAcademyCourseIE, +)  from .minhateca import MinhatecaIE  from .ministrygrid import MinistryGridIE  from .minoto import MinotoIE @@ -561,7 +578,10 @@ from .parliamentliveuk import ParliamentLiveUKIE  from .patreon import PatreonIE  from .pbs import PBSIE  from .people import PeopleIE -from .periscope import PeriscopeIE +from .periscope import ( +    PeriscopeIE, +    PeriscopeUserIE, +)  from .philharmoniedeparis import PhilharmonieDeParisIE  from .phoenix import PhoenixIE  from .photobucket import PhotobucketIE @@ -602,6 +622,10 @@ from .qqmusic import (      QQMusicPlaylistIE,  )  from .r7 import R7IE +from .radiocanada import ( +    RadioCanadaIE, +    RadioCanadaAudioVideoIE, +)  from .radiode import RadioDeIE  from .radiojavan import RadioJavanIE  from .radiobremen import RadioBremenIE @@ -615,8 +639,12 @@ from .rds import RDSIE  from .redtube import RedTubeIE  from .regiotv import RegioTVIE  from .restudy import RestudyIE +from .reuters import ReutersIE  from .reverbnation import ReverbNationIE -from .revision3 import Revision3IE +from .revision3 import ( +    Revision3EmbedIE, +    Revision3IE, +)  from .rice import RICEIE  from .ringtv import RingTVIE  from .ro220 import Ro220IE @@ -655,7 +683,9 @@ from .screencast import ScreencastIE  from .screencastomatic import ScreencastOMaticIE  from .screenjunkies import ScreenJunkiesIE  from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE +from .seeker import SeekerIE  from .senateisvp import SenateISVPIE +from .sendtonews import SendtoNewsIE  from .servingsys import ServingSysIE  from .sexu import 
SexuIE  from .shahid import ShahidIE @@ -758,6 +788,7 @@ from .thesixtyone import TheSixtyOneIE  from .thestar import TheStarIE  from .thisamericanlife import ThisAmericanLifeIE  from .thisav import ThisAVIE +from .threeqsdn import ThreeQSDNIE  from .tinypic import TinyPicIE  from .tlc import TlcDeIE  from .tmz import ( @@ -810,7 +841,10 @@ from .tvc import (  )  from .tvigle import TvigleIE  from .tvland import TVLandIE -from .tvp import TvpIE, TvpSeriesIE +from .tvp import ( +    TVPIE, +    TVPSeriesIE, +)  from .tvplay import TVPlayIE  from .tweakers import TweakersIE  from .twentyfourvideo import TwentyFourVideoIE @@ -825,7 +859,6 @@ from .twitch import (      TwitchVodIE,      TwitchProfileIE,      TwitchPastBroadcastsIE, -    TwitchBookmarksIE,      TwitchStreamIE,  )  from .twitter import ( @@ -843,7 +876,10 @@ from .unistra import UnistraIE  from .urort import UrortIE  from .usatoday import USATodayIE  from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import UstudioIE +from .ustudio import ( +    UstudioIE, +    UstudioEmbedIE, +)  from .varzesh3 import Varzesh3IE  from .vbox7 import Vbox7IE  from .veehd import VeeHDIE @@ -875,6 +911,7 @@ from .videomore import (  )  from .videopremium import VideoPremiumIE  from .videott import VideoTtIE +from .vidio import VidioIE  from .vidme import (      VidmeIE,      VidmeUserIE, @@ -922,13 +959,15 @@ from .vube import VubeIE  from .vuclip import VuClipIE  from .vulture import VultureIE  from .walla import WallaIE -from .washingtonpost import WashingtonPostIE +from .washingtonpost import ( +    WashingtonPostIE, +    WashingtonPostArticleIE, +)  from .wat import WatIE  from .watchindianporn import WatchIndianPornIE  from .wdr import (      WDRIE,      WDRMobileIE, -    WDRMausIE,  )  from .webofstories import (      WebOfStoriesIE, @@ -975,7 +1014,10 @@ from .yesjapan import YesJapanIE  from .yinyuetai import YinYueTaiIE  from .ynet import YnetIE  from .youjizz import YouJizzIE -from .youku import 
YoukuIE +from .youku import ( +    YoukuIE, +    YoukuShowIE, +)  from .youporn import YouPornIE  from .yourupload import YourUploadIE  from .youtube import ( diff --git a/youtube_dl/extractor/eyedotv.py b/youtube_dl/extractor/eyedotv.py new file mode 100644 index 000000000..2f3035147 --- /dev/null +++ b/youtube_dl/extractor/eyedotv.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    xpath_text, +    parse_duration, +    ExtractorError, +) + + +class EyedoTVIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?eyedo\.tv/[^/]+/(?:#!/)?Live/Detail/(?P<id>[0-9]+)' +    _TEST = { +        'url': 'https://www.eyedo.tv/en-US/#!/Live/Detail/16301', +        'md5': 'ba14f17995cdfc20c36ba40e21bf73f7', +        'info_dict': { +            'id': '16301', +            'ext': 'mp4', +            'title': 'Journée du conseil scientifique de l\'Afnic 2015', +            'description': 'md5:4abe07293b2f73efc6e1c37028d58c98', +            'uploader': 'Afnic Live', +            'uploader_id': '8023', +        } +    } +    _ROOT_URL = 'http://live.eyedo.net:1935/' + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        video_data = self._download_xml('http://eyedo.tv/api/live/GetLive/%s' % video_id, video_id) + +        def _add_ns(path): +            return self._xpath_ns(path, 'http://schemas.datacontract.org/2004/07/EyeDo.Core.Implementation.Web.ViewModels.Api') + +        title = xpath_text(video_data, _add_ns('Titre'), 'title', True) +        state_live_code = xpath_text(video_data, _add_ns('StateLiveCode'), 'title', True) +        if state_live_code == 'avenir': +            raise ExtractorError( +                '%s said: We\'re sorry, but this video is not yet available.' 
% self.IE_NAME, +                expected=True) + +        is_live = state_live_code == 'live' +        m3u8_url = None +        # http://eyedo.tv/Content/Html5/Scripts/html5view.js +        if is_live: +            if xpath_text(video_data, 'Cdn') == 'true': +                m3u8_url = 'http://rrr.sz.xlcdn.com/?account=eyedo&file=A%s&type=live&service=wowza&protocol=http&output=playlist.m3u8' % video_id +            else: +                m3u8_url = self._ROOT_URL + 'w/%s/eyedo_720p/playlist.m3u8' % video_id +        else: +            m3u8_url = self._ROOT_URL + 'replay-w/%s/mp4:%s.mp4/playlist.m3u8' % (video_id, video_id) + +        return { +            'id': video_id, +            'title': title, +            'formats': self._extract_m3u8_formats( +                m3u8_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native'), +            'description': xpath_text(video_data, _add_ns('Description')), +            'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))), +            'uploader': xpath_text(video_data, _add_ns('Createur')), +            'uploader_id': xpath_text(video_data, _add_ns('CreateurId')), +            'chapter': xpath_text(video_data, _add_ns('ChapitreTitre')), +            'chapter_id': xpath_text(video_data, _add_ns('ChapitreId')), +        } diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py index f1f150ef2..8d1010b88 100644 --- a/youtube_dl/extractor/fczenit.py +++ b/youtube_dl/extractor/fczenit.py @@ -1,20 +1,19 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor +from ..compat import compat_urlparse  class FczenitIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)' +    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)'      _TEST = { -        'url': 'http://fc-zenit.ru/video/gl6785/', -        'md5': '458bacc24549173fe5a5aa29174a5606', +        'url': 
'http://fc-zenit.ru/video/41044/', +        'md5': '0e3fab421b455e970fa1aa3891e57df0',          'info_dict': { -            'id': '6785', +            'id': '41044',              'ext': 'mp4', -            'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', +            'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»',          },      } @@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title') +        video_title = self._html_search_regex( +            r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title') + +        video_items = self._parse_json(self._search_regex( +            r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'), +            video_id) -        bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') -        bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) +        def merge_dicts(*dicts): +            ret = {} +            for a_dict in dicts: +                ret.update(a_dict) +            return ret          formats = [{ -            'url': furl, -            'tbr': tbr, -        } for furl, tbr in bitrates] +            'url': compat_urlparse.urljoin(url, video_url), +            'tbr': int(tbr), +        } for tbr, video_url in merge_dicts(*video_items).items()]          self._sort_formats(formats) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 0a3de1498..a8e1bf42a 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -24,13 +24,28 @@ class FlickrIE(InfoExtractor):              'upload_date': '20110423',              'uploader_id': '10922353@N03',              'uploader': 'Forest Wander', +            'uploader_url': 
'https://www.flickr.com/photos/forestwander-nature-pictures/',              'comment_count': int,              'view_count': int,              'tags': list, +            'license': 'Attribution-ShareAlike',          }      } -      _API_BASE_URL = 'https://api.flickr.com/services/rest?' +    # https://help.yahoo.com/kb/flickr/SLN25525.html +    _LICENSES = { +        '0': 'All Rights Reserved', +        '1': 'Attribution-NonCommercial-ShareAlike', +        '2': 'Attribution-NonCommercial', +        '3': 'Attribution-NonCommercial-NoDerivs', +        '4': 'Attribution', +        '5': 'Attribution-ShareAlike', +        '6': 'Attribution-NoDerivs', +        '7': 'No known copyright restrictions', +        '8': 'United States government work', +        '9': 'Public Domain Dedication (CC0)', +        '10': 'Public Domain Work', +    }      def _call_api(self, method, video_id, api_key, note, secret=None):          query = { @@ -75,6 +90,9 @@ class FlickrIE(InfoExtractor):              self._sort_formats(formats)              owner = video_info.get('owner', {}) +            uploader_id = owner.get('nsid') +            uploader_path = owner.get('path_alias') or uploader_id +            uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None              return {                  'id': video_id, @@ -83,11 +101,13 @@ class FlickrIE(InfoExtractor):                  'formats': formats,                  'timestamp': int_or_none(video_info.get('dateuploaded')),                  'duration': int_or_none(video_info.get('video', {}).get('duration')), -                'uploader_id': owner.get('nsid'), +                'uploader_id': uploader_id,                  'uploader': owner.get('realname'), +                'uploader_url': uploader_url,                  'comment_count': int_or_none(video_info.get('comments', {}).get('_content')),                  'view_count': int_or_none(video_info.get('views')), -                'tags': 
[tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] +                'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])], +                'license': self._LICENSES.get(video_info.get('license')),              }          else:              raise ExtractorError('not a video', expected=True) diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py new file mode 100644 index 000000000..322c41e5a --- /dev/null +++ b/youtube_dl/extractor/formula1.py @@ -0,0 +1,26 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Formula1IE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html' +    _TEST = { +        'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html', +        'md5': '8c79e54be72078b26b89e0e111c0502b', +        'info_dict': { +            'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV', +            'ext': 'flv', +            'title': 'Race highlights - Spain 2016', +        }, +        'add_ie': ['Ooyala'], +    } + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        ooyala_embed_code = self._search_regex( +            r'data-videoid="([^"]+)"', webpage, 'ooyala embed code') +        return self.url_result( +            'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0f1eb7fa6..90575ab0e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -61,6 +61,9 @@ from .jwplatform import JWPlatformIE  from .digiteka import DigitekaIE  from .instagram import InstagramIE  from .liveleak import LiveLeakIE +from .threeqsdn import ThreeQSDNIE +from .theplatform import ThePlatformIE +from .vessel import 
VesselIE  class GenericIE(InfoExtractor): @@ -716,15 +719,18 @@ class GenericIE(InfoExtractor):          },          # Wistia embed          { -            'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', -            'md5': '8788b683c777a5cf25621eaf286d0c23', +            'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', +            'md5': '1953f3a698ab51cfc948ed3992a0b7ff',              'info_dict': { -                'id': '1cfaf6b7ea', +                'id': '6e2wtrbdaf',                  'ext': 'mov', -                'title': 'md5:51364a8d3d009997ba99656004b5e20d', -                'duration': 643.0, -                'filesize': 182808282, -                'uploader': 'education-portal.com', +                'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', +                'description': 'a Paywall Videos video from Remilon', +                'duration': 644.072, +                'uploader': 'study.com', +                'timestamp': 1459678540, +                'upload_date': '20160403', +                'filesize': 24687186,              },          },          { @@ -733,13 +739,29 @@ class GenericIE(InfoExtractor):              'info_dict': {                  'id': 'uxjb0lwrcz',                  'ext': 'mp4', -                'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', +                'title': 'Conversation about Hexagonal Rails Part 1',                  'description': 'a Martin Fowler video from ThoughtWorks',                  'duration': 1715.0,                  'uploader': 'thoughtworks.wistia.com', -                'upload_date': '20140603',                  'timestamp': 1401832161, +                'upload_date': '20140603', +            }, +        }, +        # Wistia standard embed (async) +        { +            'url': 
'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', +            'info_dict': { +                'id': '807fafadvk', +                'ext': 'mp4', +                'title': 'Drip Brennan Dunn Workshop', +                'description': 'a JV Webinars video from getdrip-1', +                'duration': 4986.95, +                'timestamp': 1463607249, +                'upload_date': '20160518',              }, +            'params': { +                'skip_download': True, +            }          },          # Soundcloud embed          { @@ -763,6 +785,19 @@ class GenericIE(InfoExtractor):                  'title': 'Rosetta #CometLanding webcast HL 10',              }          }, +        # Another Livestream embed, without 'new.' in URL +        { +            'url': 'https://www.freespeech.org/', +            'info_dict': { +                'id': '123537347', +                'ext': 'mp4', +                'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            }, +            'params': { +                # Live stream +                'skip_download': True, +            }, +        },          # LazyYT          {              'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', @@ -847,18 +882,6 @@ class GenericIE(InfoExtractor):                  'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',              }          }, -        # Kaltura embed -        { -            'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15', -            'info_dict': { -                'id': '1_eergr3h1', -                'ext': 'mp4', -                'upload_date': '20150226', -                'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com', -                'timestamp': int, -                'title': 'John Carlson Postgame 2/25/15', -            }, -        },          # Kaltura embed (different embed code)          {              'url': 
'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', @@ -884,6 +907,19 @@ class GenericIE(InfoExtractor):                  'uploader_id': 'echojecka',              },          }, +        # Kaltura embed with single quotes +        { +            'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', +            'info_dict': { +                'id': '0_izeg5utt', +                'ext': 'mp4', +                'title': '35871', +                'timestamp': 1355743100, +                'upload_date': '20121217', +                'uploader_id': 'batchUser', +            }, +            'add_ie': ['Kaltura'], +        },          # Eagle.Platform embed (generic URL)          {              'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -998,14 +1034,18 @@ class GenericIE(InfoExtractor):          },          # UDN embed          { -            'url': 'http://www.udn.com/news/story/7314/822787', +            'url': 'https://video.udn.com/news/300346',              'md5': 'fd2060e988c326991037b9aff9df21a6',              'info_dict': {                  'id': '300346',                  'ext': 'mp4',                  'title': '中一中男師變性 全校師生力挺',                  'thumbnail': 're:^https?://.*\.jpg$', -            } +            }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          # Ooyala embed          { @@ -1173,6 +1213,16 @@ class GenericIE(InfoExtractor):                  'uploader': 'Lake8737',              }          }, +        # Duplicated embedded video URLs +        { +            'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', +            'info_dict': { +                'id': '149298443_480_16c25b74_2', +                'ext': 'mp4', +                'title': 'vs. 
Blue Orange Spring Game', +                'uploader': 'www.hudl.com', +            }, +        },      ]      def report_following_redirect(self, new_url): @@ -1427,7 +1477,8 @@ class GenericIE(InfoExtractor):          #   Site Name | Video Title          #   Video Title - Tagline | Site Name          # and so on and so forth; it's just not practical -        video_title = self._html_search_regex( +        video_title = self._og_search_title( +            webpage, default=None) or self._html_search_regex(              r'(?s)<title>(.*?)</title>', webpage, 'video title',              default='video') @@ -1445,6 +1496,9 @@ class GenericIE(InfoExtractor):          video_uploader = self._search_regex(              r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') +        video_description = self._og_search_description(webpage, default=None) +        video_thumbnail = self._og_search_thumbnail(webpage, default=None) +          # Helper method          def _playlist_from_matches(matches, getter=None, ie=None):              urlrs = orderedSet( @@ -1475,6 +1529,16 @@ class GenericIE(InfoExtractor):          if bc_urls:              return _playlist_from_matches(bc_urls, ie='BrightcoveNew') +        # Look for ThePlatform embeds +        tp_urls = ThePlatformIE._extract_urls(webpage) +        if tp_urls: +            return _playlist_from_matches(tp_urls, ie='ThePlatform') + +        # Look for Vessel embeds +        vessel_urls = VesselIE._extract_urls(webpage) +        if vessel_urls: +            return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) +          # Look for embedded rtl.nl player          matches = re.findall(              r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', @@ -1543,21 +1607,26 @@ class GenericIE(InfoExtractor):                  'url': embed_url,                  'ie_key': 'Wistia',                  'uploader': video_uploader, -                'title': video_title, -         
       'id': video_id,              }          match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)          if match:              return {                  '_type': 'url_transparent', -                'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')), +                'url': 'wistia:%s' % match.group('id'),                  'ie_key': 'Wistia',                  'uploader': video_uploader, -                'title': video_title, -                'id': match.group('id')              } +        match = re.search( +            r'''(?sx) +                <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? +                <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 +            ''', webpage) +        if match: +            return self.url_result(self._proto_relative_url( +                'wistia:%s' % match.group('id')), 'Wistia') +          # Look for SVT player          svt_url = SVTIE._extract_url(webpage)          if svt_url: @@ -1833,7 +1902,7 @@ class GenericIE(InfoExtractor):              return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')          mobj = re.search( -            r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"', +            r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"',              webpage)          if mobj is not None:              return self.url_result(mobj.group('url'), 'Livestream') @@ -1845,7 +1914,7 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'), 'Zapiks')          # Look for Kaltura embeds -        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or +        mobj = 
(re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or                  re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))          if mobj is not None:              return self.url_result(smuggle_url( @@ -1983,6 +2052,19 @@ class GenericIE(InfoExtractor):          if liveleak_url:              return self.url_result(liveleak_url, 'LiveLeak') +        # Look for 3Q SDN embeds +        threeqsdn_url = ThreeQSDNIE._extract_url(webpage) +        if threeqsdn_url: +            return { +                '_type': 'url_transparent', +                'ie_key': ThreeQSDNIE.ie_key(), +                'url': self._proto_relative_url(threeqsdn_url), +                'title': video_title, +                'description': video_description, +                'thumbnail': video_thumbnail, +                'uploader': video_uploader, +            } +          def check_video(vurl):              if YoutubeIE.suitable(vurl):                  return True @@ -2063,7 +2145,7 @@ class GenericIE(InfoExtractor):              raise UnsupportedError(url)          entries = [] -        for video_url in found: +        for video_url in orderedSet(found):              video_url = unescapeHTML(video_url)              video_url = video_url.replace('\\/', '/')              video_url = compat_urlparse.urljoin(url, video_url) diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py index f6b69662b..a6da90931 100644 --- a/youtube_dl/extractor/groupon.py +++ b/youtube_dl/extractor/groupon.py @@ -4,7 +4,7 @@ from .common import InfoExtractor  class GrouponIE(InfoExtractor): -    _VALID_URL = r'https?://www\.groupon\.com/deals/(?P<id>[^?#]+)' +    _VALID_URL = 
r'https?://(?:www\.)?groupon\.com/deals/(?P<id>[^/?#&]+)'      _TEST = {          'url': 'https://www.groupon.com/deals/bikram-yoga-huntington-beach-2#ooid=tubGNycTo_9Uxg82uESj4i61EYX8nyuf', @@ -14,17 +14,27 @@ class GrouponIE(InfoExtractor):              'description': 'Studio kept at 105 degrees and 40% humidity with anti-microbial and anti-slip Flotex flooring; certified instructors',          },          'playlist': [{ +            'md5': '42428ce8a00585f9bc36e49226eae7a1',              'info_dict': { -                'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf', -                'ext': 'flv', -                'title': 'Bikram Yoga Huntington Beach | Orange County', +                'id': 'fk6OhWpXgIQ', +                'ext': 'mp4', +                'title': 'Bikram Yoga Huntington Beach | Orange County !tubGNycTo@9Uxg82uESj4i61EYX8nyuf',                  'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', -                'duration': 44.961, +                'duration': 45, +                'upload_date': '20160405', +                'uploader_id': 'groupon', +                'uploader': 'Groupon',              }, +            'add_ie': ['Youtube'],          }],          'params': { -            'skip_download': 'HDS', -        } +            'skip_download': True, +        }, +    } + +    _PROVIDERS = { +        'ooyala': ('ooyala:%s', 'Ooyala'), +        'youtube': ('%s', 'Youtube'),      }      def _real_extract(self, url): @@ -36,12 +46,17 @@ class GrouponIE(InfoExtractor):          videos = payload['carousel'].get('dealVideos', [])          entries = []          for v in videos: -            if v.get('provider') != 'OOYALA': +            provider = v.get('provider') +            video_id = v.get('media') or v.get('id') or v.get('baseURL') +            if not provider or not video_id: +                continue +            url_pattern, ie_key = self._PROVIDERS.get(provider.lower()) +            if not url_pattern:                  self.report_warning(     
                 '%s: Unsupported video provider %s, skipping video' % -                    (playlist_id, v.get('provider'))) +                    (playlist_id, provider))                  continue -            entries.append(self.url_result('ooyala:%s' % v['media'])) +            entries.append(self.url_result(url_pattern % video_id, ie_key))          return {              '_type': 'playlist', diff --git a/youtube_dl/extractor/hearthisat.py b/youtube_dl/extractor/hearthisat.py index 7d8698655..256453882 100644 --- a/youtube_dl/extractor/hearthisat.py +++ b/youtube_dl/extractor/hearthisat.py @@ -7,6 +7,7 @@ from .common import InfoExtractor  from ..compat import compat_urlparse  from ..utils import (      HEADRequest, +    KNOWN_EXTENSIONS,      sanitized_Request,      str_to_int,      urlencode_postdata, @@ -17,7 +18,7 @@ from ..utils import (  class HearThisAtIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$'      _PLAYLIST_URL = 'https://hearthis.at/playlist.php' -    _TEST = { +    _TESTS = [{          'url': 'https://hearthis.at/moofi/dr-kreep',          'md5': 'ab6ec33c8fed6556029337c7885eb4e0',          'info_dict': { @@ -26,7 +27,7 @@ class HearThisAtIE(InfoExtractor):              'title': 'Moofi - Dr. Kreep',              'thumbnail': 're:^https?://.*\.jpg$',              'timestamp': 1421564134, -            'description': 'Creepy Patch. Mutable Instruments Braids Vowel + Formant Mode.', +            'description': 'Listen to Dr. 
Kreep by Moofi on hearthis.at - Modular, Eurorack, Mutable Intruments Braids, Valhalla-DSP',              'upload_date': '20150118',              'comment_count': int,              'view_count': int, @@ -34,7 +35,25 @@ class HearThisAtIE(InfoExtractor):              'duration': 71,              'categories': ['Experimental'],          } -    } +    }, { +        # 'download' link redirects to the original webpage +        'url': 'https://hearthis.at/twitchsf/dj-jim-hopkins-totally-bitchin-80s-dance-mix/', +        'md5': '5980ceb7c461605d30f1f039df160c6e', +        'info_dict': { +            'id': '811296', +            'ext': 'mp3', +            'title': 'TwitchSF - DJ Jim Hopkins -  Totally Bitchin\' 80\'s Dance Mix!', +            'description': 'Listen to DJ Jim Hopkins -  Totally Bitchin\' 80\'s Dance Mix! by TwitchSF on hearthis.at - Dance', +            'upload_date': '20160328', +            'timestamp': 1459186146, +            'thumbnail': 're:^https?://.*\.jpg$', +            'comment_count': int, +            'view_count': int, +            'like_count': int, +            'duration': 4360, +            'categories': ['Dance'], +        }, +    }]      def _real_extract(self, url):          m = re.match(self._VALID_URL, url) @@ -90,13 +109,14 @@ class HearThisAtIE(InfoExtractor):              ext_handle = self._request_webpage(                  ext_req, display_id, note='Determining extension')              ext = urlhandle_detect_ext(ext_handle) -            formats.append({ -                'format_id': 'download', -                'vcodec': 'none', -                'ext': ext, -                'url': download_url, -                'preference': 2,  # Usually better quality -            }) +            if ext in KNOWN_EXTENSIONS: +                formats.append({ +                    'format_id': 'download', +                    'vcodec': 'none', +                    'ext': ext, +                    'url': download_url, +                    
'preference': 2,  # Usually better quality +                })          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py index e8f51e545..7e36b85ad 100644 --- a/youtube_dl/extractor/howcast.py +++ b/youtube_dl/extractor/howcast.py @@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)'      _TEST = {          'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly', -        'md5': '8b743df908c42f60cf6496586c7f12c3', +        'md5': '7d45932269a288149483144f01b99789',          'info_dict': {              'id': '390161',              'ext': 'mp4', @@ -19,9 +19,9 @@ class HowcastIE(InfoExtractor):              'duration': 56.823,          },          'params': { -            # m3u8 download              'skip_download': True,          }, +        'add_ie': ['Ooyala'],      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 8bed8ccd0..3a2b7cec5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -1,10 +1,10 @@  from __future__ import unicode_literals  import re -import json  from .common import InfoExtractor  from ..utils import ( +    mimetype2ext,      qualities,  ) @@ -12,9 +12,9 @@ from ..utils import (  class ImdbIE(InfoExtractor):      IE_NAME = 'imdb'      IE_DESC = 'Internet Movie Database trailers' -    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/[^/]+/vi(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.imdb.com/video/imdb/vi2524815897',          'info_dict': {              'id': '2524815897', @@ -22,7 +22,10 @@ class ImdbIE(InfoExtractor):              'title': 'Ice Age: Continental Drift Trailer (No. 
2) - IMDb',              'description': 'md5:9061c2219254e5d14e03c25c98e96a81',          } -    } +    }, { +        'url': 'http://www.imdb.com/video/_/vi2524815897', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -48,13 +51,27 @@ class ImdbIE(InfoExtractor):              json_data = self._search_regex(                  r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',                  format_page, 'json data', flags=re.DOTALL) -            info = json.loads(json_data) -            format_info = info['videoPlayerObject']['video'] -            f_id = format_info['ffname'] +            info = self._parse_json(json_data, video_id, fatal=False) +            if not info: +                continue +            format_info = info.get('videoPlayerObject', {}).get('video', {}) +            if not format_info: +                continue +            video_info_list = format_info.get('videoInfoList') +            if not video_info_list or not isinstance(video_info_list, list): +                continue +            video_info = video_info_list[0] +            if not video_info or not isinstance(video_info, dict): +                continue +            video_url = video_info.get('videoUrl') +            if not video_url: +                continue +            format_id = format_info.get('ffname')              formats.append({ -                'format_id': f_id, -                'url': format_info['videoInfoList'][0]['videoUrl'], -                'quality': quality(f_id), +                'format_id': format_id, +                'url': video_url, +                'ext': mimetype2ext(video_info.get('videoMimeType')), +                'quality': quality(format_id),              })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ffb8008ce..ddcb3c916 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ 
-505,7 +505,10 @@ class IqiyiIE(InfoExtractor):              'enc': md5_text(enc_key + tail),              'qyid': _uuid,              'tn': random.random(), -            'um': 0, +            # In iQiyi's flash player, um is set to 1 if there's a logged user +            # Some 1080P formats are only available with a logged user. +            # Here force um=1 to trick the iQiyi server +            'um': 1,              'authkey': md5_text(md5_text('') + tail),              'k_tag': 1,          } diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 8a5e562db..fa6f335e1 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -5,33 +5,50 @@ import re  from .common import InfoExtractor  from ..utils import ( +    determine_ext,      float_or_none,      int_or_none,  )  class JWPlatformBaseIE(InfoExtractor): -    def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True): +    def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None):          video_data = jwplayer_data['playlist'][0]          formats = []          for source in video_data['sources']:              source_url = self._proto_relative_url(source['file'])              source_type = source.get('type') or '' -            if source_type in ('application/vnd.apple.mpegurl', 'hls'): +            if source_type in ('application/vnd.apple.mpegurl', 'hls') or determine_ext(source_url) == 'm3u8':                  formats.extend(self._extract_m3u8_formats( -                    source_url, video_id, 'mp4', 'm3u8_native', fatal=False)) +                    source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))              elif source_type.startswith('audio'):                  formats.append({                      'url': source_url,                      'vcodec': 'none',                  })              else: -                formats.append({ +                
a_format = {                      'url': source_url,                      'width': int_or_none(source.get('width')),                      'height': int_or_none(source.get('height')), -                }) +                } +                if source_url.startswith('rtmp'): +                    a_format['ext'] = 'flv', + +                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as +                    # of jwplayer.flash.swf +                    rtmp_url_parts = re.split( +                        r'((?:mp4|mp3|flv):)', source_url, 1) +                    if len(rtmp_url_parts) == 3: +                        rtmp_url, prefix, play_path = rtmp_url_parts +                        a_format.update({ +                            'url': rtmp_url, +                            'play_path': prefix + play_path, +                        }) +                    if rtmp_params: +                        a_format.update(rtmp_params) +                formats.append(a_format)          self._sort_formats(formats)          subtitles = {} diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 616ed19e1..11b31a699 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -266,6 +266,7 @@ class KuwoCategoryIE(InfoExtractor):          'info_dict': {              'id': '86375',              'title': '八十年代精选', +            'description': '这些都是属于八十年代的回忆!',          },          'playlist_mincount': 24,      } diff --git a/youtube_dl/extractor/learnr.py b/youtube_dl/extractor/learnr.py new file mode 100644 index 000000000..1435e090e --- /dev/null +++ b/youtube_dl/extractor/learnr.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class LearnrIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?learnr\.pro/view/video/(?P<id>[0-9]+)' +    _TEST = { +        'url': 
'http://www.learnr.pro/view/video/51624-web-development-tutorial-for-beginners-1-how-to-build-webpages-with-html-css-javascript', +        'md5': '3719fdf0a68397f49899e82c308a89de', +        'info_dict': { +            'id': '51624', +            'ext': 'mp4', +            'title': 'Web Development Tutorial for Beginners (#1) - How to build webpages with HTML, CSS, Javascript', +            'description': 'md5:b36dbfa92350176cdf12b4d388485503', +            'uploader': 'LearnCode.academy', +            'uploader_id': 'learncodeacademy', +            'upload_date': '20131021', +        }, +        'add_ie': ['Youtube'], +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        return { +            '_type': 'url_transparent', +            'url': self._search_regex( +                r"videoId\s*:\s*'([^']+)'", webpage, 'youtube id'), +            'id': video_id, +        } diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py new file mode 100644 index 000000000..0a94366fd --- /dev/null +++ b/youtube_dl/extractor/libraryofcongress.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( +    determine_ext, +    float_or_none, +    int_or_none, +    parse_filesize, +) + + +class LibraryOfCongressIE(InfoExtractor): +    IE_NAME = 'loc' +    IE_DESC = 'Library of Congress' +    _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9]+)' +    _TESTS = [{ +        # embedded via <div class="media-player" +        'url': 'http://loc.gov/item/90716351/', +        'md5': '353917ff7f0255aa6d4b80a034833de8', +        'info_dict': { +            'id': '90716351', +            'ext': 'mp4', +            'title': "Pa's trip to Mars", +            'thumbnail': 're:^https?://.*\.jpg$', +            
'duration': 0, +            'view_count': int, +        }, +    }, { +        # webcast embedded via mediaObjectId +        'url': 'https://www.loc.gov/today/cyberlc/feature_wdesc.php?rec=5578', +        'info_dict': { +            'id': '5578', +            'ext': 'mp4', +            'title': 'Help! Preservation Training Needs Here, There & Everywhere', +            'duration': 3765, +            'view_count': int, +            'subtitles': 'mincount:1', +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        # with direct download links +        'url': 'https://www.loc.gov/item/78710669/', +        'info_dict': { +            'id': '78710669', +            'ext': 'mp4', +            'title': 'La vie et la passion de Jesus-Christ', +            'duration': 0, +            'view_count': int, +            'formats': 'mincount:4', +        }, +        'params': { +            'skip_download': True, +        }, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        media_id = self._search_regex( +            (r'id=(["\'])media-player-(?P<id>.+?)\1', +             r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1', +             r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1', +             r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1'), +            webpage, 'media id', group='id') + +        data = self._download_json( +            'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id, +            video_id)['mediaObject'] + +        derivative = data['derivatives'][0] +        media_url = derivative['derivativeUrl'] + +        title = derivative.get('shortName') or data.get('shortName') or self._og_search_title( +            webpage) + +        # Following algorithm was extracted from setAVSource js function +        # found in webpage +        media_url = media_url.replace('rtmp', 'https') + +        is_video = 
data.get('mediaType', 'v').lower() == 'v' +        ext = determine_ext(media_url) +        if ext not in ('mp4', 'mp3'): +            media_url += '.mp4' if is_video else '.mp3' + +        if 'vod/mp4:' in media_url: +            formats = [{ +                'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8', +                'format_id': 'hls', +                'ext': 'mp4', +                'protocol': 'm3u8_native', +                'quality': 1, +            }] +        elif 'vod/mp3:' in media_url: +            formats = [{ +                'url': media_url.replace('vod/mp3:', ''), +                'vcodec': 'none', +            }] + +        download_urls = set() +        for m in re.finditer( +                r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?: |\s+)\((?P<size>.+?)\))?\s*<', webpage): +            format_id = m.group('id').lower() +            if format_id == 'gif': +                continue +            download_url = m.group('url') +            if download_url in download_urls: +                continue +            download_urls.add(download_url) +            formats.append({ +                'url': download_url, +                'format_id': format_id, +                'filesize_approx': parse_filesize(m.group('size')), +            }) + +        self._sort_formats(formats) + +        duration = float_or_none(data.get('duration')) +        view_count = int_or_none(data.get('viewCount')) + +        subtitles = {} +        cc_url = data.get('ccUrl') +        if cc_url: +            subtitles.setdefault('en', []).append({ +                'url': cc_url, +                'ext': 'ttml', +            }) + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': self._og_search_thumbnail(webpage, default=None), +            'duration': duration, +            'view_count': view_count, +            'formats': formats, +            'subtitles': 
subtitles, +        } diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index ba2f80a75..c2b4490c4 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -7,48 +7,53 @@ from .common import InfoExtractor  from ..compat import compat_urlparse  from ..utils import (      determine_ext, +    ExtractorError,      int_or_none, +    parse_iso8601,      remove_end, -    unified_strdate, -    ExtractorError,  )  class LifeNewsIE(InfoExtractor): -    IE_NAME = 'lifenews' -    IE_DESC = 'LIFE | NEWS' -    _VALID_URL = r'https?://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' +    IE_NAME = 'life' +    IE_DESC = 'Life.ru' +    _VALID_URL = r'https?://life\.ru/t/[^/]+/(?P<id>\d+)'      _TESTS = [{          # single video embedded via video/source -        'url': 'http://lifenews.ru/news/98736', +        'url': 'https://life.ru/t/новости/98736',          'md5': '77c95eaefaca216e32a76a343ad89d23',          'info_dict': {              'id': '98736',              'ext': 'mp4',              'title': 'Мужчина нашел дома архив оборонного завода',              'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26', +            'timestamp': 1344154740,              'upload_date': '20120805', +            'view_count': int,          }      }, {          # single video embedded via iframe -        'url': 'http://lifenews.ru/news/152125', +        'url': 'https://life.ru/t/новости/152125',          'md5': '77d19a6f0886cd76bdbf44b4d971a273',          'info_dict': {              'id': '152125',              'ext': 'mp4',              'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',              'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. 
', +            'timestamp': 1427961840,              'upload_date': '20150402', +            'view_count': int,          }      }, {          # two videos embedded via iframe -        'url': 'http://lifenews.ru/news/153461', +        'url': 'https://life.ru/t/новости/153461',          'info_dict': {              'id': '153461',              'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',              'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', -            'upload_date': '20150505', +            'timestamp': 1430825520, +            'view_count': int,          },          'playlist': [{              'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795', @@ -57,6 +62,7 @@ class LifeNewsIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)',                  'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', +                'timestamp': 1430825520,                  'upload_date': '20150505',              },          }, { @@ -66,22 +72,25 @@ class LifeNewsIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)',                  'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', +                'timestamp': 1430825520,                  'upload_date': '20150505',              },          }],      }, { -        'url': 'http://lifenews.ru/video/13035', +        'url': 'https://life.ru/t/новости/213035', +        'only_matching': True, +    }, { +        'url': 
'https://life.ru/t/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/153461', +        'only_matching': True, +    }, { +        'url': 'https://life.ru/t/новости/411489/manuel_vals_nazval_frantsiiu_tsieliu_nomier_odin_dlia_ighil',          'only_matching': True,      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        section = mobj.group('section') +        video_id = self._match_id(url) -        webpage = self._download_webpage( -            'http://lifenews.ru/%s/%s' % (section, video_id), -            video_id, 'Downloading page') +        webpage = self._download_webpage(url, video_id)          video_urls = re.findall(              r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage) @@ -95,26 +104,22 @@ class LifeNewsIE(InfoExtractor):          title = remove_end(              self._og_search_title(webpage), -            ' - Первый по срочным новостям — LIFE | NEWS') +            ' - Life.ru')          description = self._og_search_description(webpage)          view_count = self._html_search_regex( -            r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False) -        comment_count = self._html_search_regex( -            r'=\'commentCount\'[^>]*>\s*(\d+)\s*<', -            webpage, 'comment count', fatal=False) +            r'<div[^>]+class=(["\']).*?\bhits-count\b.*?\1[^>]*>\s*(?P<value>\d+)\s*</div>', +            webpage, 'view count', fatal=False, group='value') -        upload_date = self._html_search_regex( -            r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False) -        if upload_date is not None: -            upload_date = unified_strdate(upload_date) +        timestamp = parse_iso8601(self._search_regex( +            r'<time[^>]+datetime=(["\'])(?P<value>.+?)\1', +            webpage, 'upload date', fatal=False, group='value'))          common_info = {              'description': description,              
'view_count': int_or_none(view_count), -            'comment_count': int_or_none(comment_count), -            'upload_date': upload_date, +            'timestamp': timestamp,          }          def make_entry(video_id, video_url, index=None): @@ -183,7 +188,8 @@ class LifeEmbedIE(InfoExtractor):              ext = determine_ext(video_url)              if ext == 'm3u8':                  formats.extend(self._extract_m3u8_formats( -                    video_url, video_id, 'mp4', m3u8_id='m3u8')) +                    video_url, video_id, 'mp4', +                    entry_protocol='m3u8_native', m3u8_id='m3u8'))              else:                  formats.append({                      'url': video_url, diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py new file mode 100644 index 000000000..3356d015d --- /dev/null +++ b/youtube_dl/extractor/litv.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    int_or_none, +    smuggle_url, +    unsmuggle_url, +) + + +class LiTVIE(InfoExtractor): +    _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P<id>[^&]+)' + +    _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + +    _TESTS = [{ +        'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', +        'info_dict': { +            'id': 'VOD00041606', +            'title': '花千骨', +        }, +        'playlist_count': 50, +    }, { +        'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', +        'info_dict': { +            'id': 'VOD00041610', +            'ext': 'mp4', +            'title': '花千骨第1集', +            'thumbnail': 're:https?://.*\.jpg$', +            'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', +            'episode_number': 1, +        }, +        
'params': { +            'noplaylist': True, +            'skip_download': True,  # m3u8 download +        }, +        'skip': 'Georestricted to Taiwan', +    }] + +    def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True): +        episode_title = view_data['title'] +        content_id = season_list['contentId'] + +        if prompt: +            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id)) + +        all_episodes = [ +            self.url_result(smuggle_url( +                self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']), +                {'force_noplaylist': True}))  # To prevent infinite recursion +            for episode in season_list['episode']] + +        return self.playlist_result(all_episodes, content_id, episode_title) + +    def _real_extract(self, url): +        url, data = unsmuggle_url(url, {}) + +        video_id = self._match_id(url) + +        noplaylist = self._downloader.params.get('noplaylist') +        noplaylist_prompt = True +        if 'force_noplaylist' in data: +            noplaylist = data['force_noplaylist'] +            noplaylist_prompt = False + +        webpage = self._download_webpage(url, video_id) + +        view_data = dict(map(lambda t: (t[0], t[2]), re.findall( +            r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2', +            webpage))) + +        vod_data = self._parse_json(self._search_regex( +            'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), +            video_id) + +        season_list = list(vod_data.get('seasonList', {}).values()) +        if season_list: +            if not noplaylist: +                return self._extract_playlist( +                    season_list[0], video_id, vod_data, view_data, +                    prompt=noplaylist_prompt) + +            if noplaylist_prompt: +                self.to_screen('Downloading just video %s because of 
--no-playlist' % video_id) + +        # In browsers `getMainUrl` request is always issued. Usually this +        # endpoint gives the same result as the data embedded in the webpage. +        # If georestricted, there are no embedded data, so an extra request is +        # necessary to get the error code +        video_data = self._parse_json(self._search_regex( +            r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', +            webpage, 'video data', default='{}'), video_id) +        if not video_data: +            payload = { +                'assetId': view_data['assetId'], +                'watchDevices': vod_data['watchDevices'], +                'contentType': view_data['contentType'], +            } +            video_data = self._download_json( +                'https://www.litv.tv/vod/getMainUrl', video_id, +                data=json.dumps(payload).encode('utf-8'), +                headers={'Content-Type': 'application/json'}) + +        if not video_data.get('fullpath'): +            error_msg = video_data.get('errorMessage') +            if error_msg == 'vod.error.outsideregionerror': +                self.raise_geo_restricted('This video is available in Taiwan only') +            if error_msg: +                raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True) +            raise ExtractorError('Unexpected result from %s' % self.IE_NAME) + +        formats = self._extract_m3u8_formats( +            video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls') +        for a_format in formats: +            # LiTV HLS segments doesn't like compressions +            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + +        title = view_data['title'] + view_data.get('secondaryMark', '') +        description = view_data.get('description') +        thumbnail = view_data.get('imageFile') +        categories = [item['name'] for item in vod_data.get('category', [])] +        episode = 
int_or_none(view_data.get('episode')) + +        return { +            'id': video_id, +            'formats': formats, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'categories': categories, +            'episode_number': episode, +        } diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 29fba5f30..ea0565ac0 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -17,7 +17,8 @@ class LiveLeakIE(InfoExtractor):              'ext': 'flv',              'description': 'extremely bad day for this guy..!',              'uploader': 'ljfriel2', -            'title': 'Most unlucky car accident' +            'title': 'Most unlucky car accident', +            'thumbnail': 're:^https?://.*\.jpg$'          }      }, {          'url': 'http://www.liveleak.com/view?i=f93_1390833151', @@ -28,6 +29,7 @@ class LiveLeakIE(InfoExtractor):              'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',              'uploader': 'ARD_Stinkt',              'title': 'German Television does first Edward Snowden Interview (ENGLISH)', +            'thumbnail': 're:^https?://.*\.jpg$'          }      }, {          'url': 'http://www.liveleak.com/view?i=4f7_1392687779', @@ -49,7 +51,8 @@ class LiveLeakIE(InfoExtractor):              'ext': 'mp4',              'description': 'Happened on 27.7.2014. 
\r\nAt 0:53 you can see people still swimming at near beach.',              'uploader': 'bony333', -            'title': 'Crazy Hungarian tourist films close call waterspout in Croatia' +            'title': 'Crazy Hungarian tourist films close call waterspout in Croatia', +            'thumbnail': 're:^https?://.*\.jpg$'          }      }] @@ -72,6 +75,7 @@ class LiveLeakIE(InfoExtractor):          age_limit = int_or_none(self._search_regex(              r'you confirm that you are ([0-9]+) years and over.',              webpage, 'age limit', default=None)) +        video_thumbnail = self._og_search_thumbnail(webpage)          sources_raw = self._search_regex(              r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) @@ -124,4 +128,5 @@ class LiveLeakIE(InfoExtractor):              'uploader': video_uploader,              'formats': formats,              'age_limit': age_limit, +            'thumbnail': video_thumbnail,          } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index eada7c299..bc7894bf1 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -150,7 +150,7 @@ class LivestreamIE(InfoExtractor):          }      def _extract_stream_info(self, stream_info): -        broadcast_id = stream_info['broadcast_id'] +        broadcast_id = compat_str(stream_info['broadcast_id'])          is_live = stream_info.get('is_live')          formats = [] @@ -203,9 +203,10 @@ class LivestreamIE(InfoExtractor):              if not videos_info:                  break              for v in videos_info: +                v_id = compat_str(v['id'])                  entries.append(self.url_result( -                    'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v['id']), -                    'Livestream', v['id'], v['caption'])) +                    'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id), +           
         'Livestream', v_id, v.get('caption')))              last_video = videos_info[-1]['id']          return self.playlist_result(entries, event_id, event_data['full_name']) diff --git a/youtube_dl/extractor/localnews8.py b/youtube_dl/extractor/localnews8.py new file mode 100644 index 000000000..aad396135 --- /dev/null +++ b/youtube_dl/extractor/localnews8.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class LocalNews8IE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?localnews8\.com/(?:[^/]+/)*(?P<display_id>[^/]+)/(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://www.localnews8.com/news/rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings/35183304', +        'md5': 'be4d48aea61aa2bde7be2ee47691ad20', +        'info_dict': { +            'id': '35183304', +            'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings', +            'ext': 'mp4', +            'title': 'Rexburg business turns carbon fiber scraps into wedding ring', +            'description': 'The process was first invented by Lamborghini and less than a dozen companies around the world use it.', +            'duration': 153, +            'timestamp': 1441844822, +            'upload_date': '20150910', +            'uploader_id': 'api', +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        display_id = mobj.group('display_id') + +        webpage = self._download_webpage(url, display_id) + +        partner_id = self._search_regex( +            r'partnerId\s*[:=]\s*(["\'])(?P<id>\d+)\1', +            webpage, 'partner id', group='id') +        kaltura_id = self._search_regex( +            r'videoIdString\s*[:=]\s*(["\'])kaltura:(?P<id>[0-9a-z_]+)\1', +            webpage, 'videl id', group='id') + +        return { +            '_type': 'url_transparent', +            'url': 
'kaltura:%s:%s' % (partner_id, kaltura_id), +            'ie_key': 'Kaltura', +            'id': video_id, +            'display_id': display_id, +        } diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index a14d176a5..9fbc74f5d 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -11,7 +11,7 @@ class MGTVIE(InfoExtractor):      _TEST = {          'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', -        'md5': '', +        'md5': '1bdadcf760a0b90946ca68ee9a2db41a',          'info_dict': {              'id': '3116640',              'ext': 'mp4', @@ -20,15 +20,6 @@ class MGTVIE(InfoExtractor):              'duration': 7461,              'thumbnail': 're:^https?://.*\.jpg$',          }, -        'params': { -            'skip_download': True,  # m3u8 download -        }, -    } - -    _FORMAT_MAP = { -        '标清': ('Standard', 0), -        '高清': ('High', 1), -        '超清': ('SuperHigh', 2),      }      def _real_extract(self, url): @@ -40,17 +31,27 @@ class MGTVIE(InfoExtractor):          formats = []          for idx, stream in enumerate(api_data['stream']): -            format_name = stream.get('name') -            format_id, preference = self._FORMAT_MAP.get(format_name, (None, None)) -            format_info = self._download_json( -                stream['url'], video_id, -                note='Download video info for format %s' % format_id or '#%d' % idx) -            formats.append({ -                'format_id': format_id, -                'url': format_info['info'], -                'ext': 'mp4',  # These are m3u8 playlists -                'preference': preference, -            }) +            stream_url = stream.get('url') +            if not stream_url: +                continue +            tbr = int_or_none(self._search_regex( +                r'(\d+)\.mp4', stream_url, 'tbr', default=None)) + +            def extract_format(stream_url, format_id, idx, query={}): +                format_info 
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_xpath,
)
from ..utils import (
    int_or_none,
    parse_duration,
    smuggle_url,
    unsmuggle_url,
    xpath_text,
)


class MicrosoftVirtualAcademyBaseIE(InfoExtractor):
    """Shared helpers for the MVA single-video and course extractors."""

    def _extract_base_url(self, course_id, display_id):
        """Fetch the course's content base URL from the anonymous products API."""
        return self._download_json(
            'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id,
            display_id, 'Downloading course base URL')

    def _extract_chapter_and_title(self, title):
        """Split a 'N | Title' string into (chapter_number, title).

        Returns (None, None) for empty input and (None, title) when no
        chapter prefix is present.
        """
        if not title:
            return None, None
        m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title)
        return (int(m.group('chapter')), m.group('title')) if m else (None, title)


class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
    IE_NAME = 'mva'
    IE_DESC = 'Microsoft Virtual Academy videos'
    _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME

    _TESTS = [{
        'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382',
        'md5': '7826c44fc31678b12ad8db11f6b5abb9',
        'info_dict': {
            'id': 'gfVXISmEB_6804984382',
            'ext': 'mp4',
            'title': 'Course Introduction',
            'formats': 'mincount:3',
            'subtitles': {
                'en': [{
                    'ext': 'ttml',
                }],
            },
        }
    }, {
        'url': 'mva:11788:gfVXISmEB_6804984382',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract one MVA video from its per-video settings XML.

        The course extractor smuggles 'base_url' in to avoid re-fetching it
        for every video of a course.
        """
        url, smuggled_data = unsmuggle_url(url, {})

        mobj = re.match(self._VALID_URL, url)
        course_id = mobj.group('course_id')
        video_id = mobj.group('id')

        base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id)

        settings = self._download_xml(
            '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id),
            video_id, 'Downloading video settings XML')

        _, title = self._extract_chapter_and_title(xpath_text(
            settings, './/Title', 'title', fatal=True))

        formats = []

        for sources in settings.findall(compat_xpath('.//MediaSources')):
            # Smooth-streaming sources are skipped; only plain HTTP URLs used.
            if sources.get('videoType') == 'smoothstreaming':
                continue
            for source in sources.findall(compat_xpath('./MediaSource')):
                video_url = source.text
                if not video_url or not video_url.startswith('http'):
                    continue
                video_mode = source.get('videoMode')
                # videoMode looks like '720p' — derive height from it.
                height = int_or_none(self._search_regex(
                    r'^(\d+)[pP]$', video_mode or '', 'height', default=None))
                codec = source.get('codec')
                acodec, vcodec = [None] * 2
                if codec:
                    # 'acodec,vcodec' pair, or a single video codec.
                    codecs = codec.split(',')
                    if len(codecs) == 2:
                        acodec, vcodec = codecs
                    elif len(codecs) == 1:
                        vcodec = codecs[0]
                formats.append({
                    'url': video_url,
                    'format_id': video_mode,
                    'height': height,
                    'acodec': acodec,
                    'vcodec': vcodec,
                })
        self._sort_formats(formats)

        subtitles = {}
        for source in settings.findall(compat_xpath('.//MarkerResourceSource')):
            subtitle_url = source.text
            if not subtitle_url:
                continue
            # NOTE(review): subtitle language is assumed English — the XML
            # does not appear to carry a language attribute; confirm.
            subtitles.setdefault('en', []).append({
                'url': '%s/%s' % (base_url, subtitle_url),
                'ext': source.get('type'),
            })

        return {
            'id': video_id,
            'title': title,
            'subtitles': subtitles,
            'formats': formats
        }


class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
    IE_NAME = 'mva:course'
    IE_DESC = 'Microsoft Virtual Academy courses'
    _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME

    _TESTS = [{
        'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
        'info_dict': {
            'id': '11788',
            'title': 'Microsoft Azure Fundamentals: Virtual Machines',
        },
        'playlist_count': 36,
    }, {
        # with emphasized chapters
        'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335',
        'info_dict': {
            'id': '16335',
            'title': 'Developing Windows 10 Games with Construct 2',
        },
        'playlist_count': 10,
    }, {
        'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
        'only_matching': True,
    }, {
        'url': 'mva:course:11788',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the single-video extractor when its pattern matches, since
        # the course _VALID_URL is a prefix of the video one.
        return False if MicrosoftVirtualAcademyIE.suitable(url) else super(
            MicrosoftVirtualAcademyCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        """Build a playlist of all Video resources from the course manifest."""
        mobj = re.match(self._VALID_URL, url)
        course_id = mobj.group('id')
        display_id = mobj.group('display_id')

        base_url = self._extract_base_url(course_id, display_id)

        manifest = self._download_json(
            '%s/imsmanifestlite.json' % base_url,
            display_id, 'Downloading course manifest JSON')['manifest']

        organization = manifest['organizations']['organization'][0]

        entries = []
        for chapter in organization['item']:
            chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title'))
            chapter_id = chapter.get('@identifier')
            for item in chapter.get('item', []):
                item_id = item.get('@identifier')
                if not item_id:
                    continue
                metadata = item.get('resource', {}).get('metadata') or {}
                # Non-video resources (slides, labs, ...) are skipped.
                if metadata.get('learningresourcetype') != 'Video':
                    continue
                _, title = self._extract_chapter_and_title(item.get('title'))
                duration = parse_duration(metadata.get('duration'))
                description = metadata.get('description')
                entries.append({
                    '_type': 'url_transparent',
                    # Smuggle base_url so MicrosoftVirtualAcademyIE needn't
                    # refetch it per video.
                    'url': smuggle_url(
                        'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}),
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'chapter': chapter_title,
                    'chapter_number': chapter_number,
                    'chapter_id': chapter_id,
                })

        title = organization.get('title') or manifest.get('metadata', {}).get('title')

        return self.playlist_result(entries, course_id, title)
int_or_none, +    remove_end, +    unified_strdate,  )  class NDTVIE(InfoExtractor): -    _VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)' +    _VALID_URL = r'https?://(?:www\.)?ndtv\.com/video/(?:[^/]+/)+[^/?^&]+-(?P<id>\d+)'      _TEST = { -        'url': 'http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710', +        'url': 'http://www.ndtv.com/video/news/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal-300710',          'md5': '39f992dbe5fb531c395d8bbedb1e5e88',          'info_dict': {              'id': '300710', @@ -22,7 +21,7 @@ class NDTVIE(InfoExtractor):              'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02',              'upload_date': '20131208',              'duration': 1327, -            'thumbnail': 'http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg', +            'thumbnail': 're:https?://.*\.jpg',          },      } @@ -30,36 +29,19 @@ class NDTVIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) +        title = remove_end(self._og_search_title(webpage), ' - NDTV') +          filename = self._search_regex(              r"__filename='([^']+)'", webpage, 'video filename') -        video_url = ('http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % -                     filename) +        video_url = 'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % filename          duration = int_or_none(self._search_regex(              r"__duration='([^']+)'", webpage, 'duration', fatal=False)) -        date_m = re.search(r'''(?x) -            <p\s+class="vod_dateline">\s* -                Published\s+On:\s* -                (?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+) -            ''', webpage) -        upload_date = None - -        if date_m is not None: -            month 
= month_by_name(date_m.group('monthname')) -            if month is not None: -                upload_date = '%s%02d%02d' % ( -                    date_m.group('year'), month, int(date_m.group('day'))) - -        description = self._og_search_description(webpage) -        READ_MORE = ' (Read more)' -        if description.endswith(READ_MORE): -            description = description[:-len(READ_MORE)] +        upload_date = unified_strdate(self._html_search_meta( +            'publish-date', webpage, 'upload date', fatal=False)) -        title = self._og_search_title(webpage) -        TITLE_SUFFIX = ' - NDTV' -        if title.endswith(TITLE_SUFFIX): -            title = title[:-len(TITLE_SUFFIX)] +        description = remove_end(self._og_search_description(webpage), ' (Read more)')          return {              'id': video_id, diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py index 51e4a34f7..adcc636bc 100644 --- a/youtube_dl/extractor/nfb.py +++ b/youtube_dl/extractor/nfb.py @@ -2,8 +2,12 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..utils import ( -    sanitized_Request, +    clean_html, +    determine_ext, +    int_or_none, +    qualities,      urlencode_postdata, +    xpath_text,  ) @@ -16,12 +20,12 @@ class NFBIE(InfoExtractor):          'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',          'info_dict': {              'id': 'qallunaat_why_white_people_are_funny', -            'ext': 'mp4', +            'ext': 'flv',              'title': 'Qallunaat! 
Why White People Are Funny ', -            'description': 'md5:836d8aff55e087d04d9f6df554d4e038', +            'description': 'md5:6b8e32dde3abf91e58857b174916620c',              'duration': 3128, +            'creator': 'Mark Sandiford',              'uploader': 'Mark Sandiford', -            'uploader_id': 'mark-sandiford',          },          'params': {              # rtmp download @@ -31,65 +35,78 @@ class NFBIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        page = self._download_webpage( -            'https://www.nfb.ca/film/%s' % video_id, video_id, -            'Downloading film page') -        uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"', -                                              page, 'director id', fatal=False) -        uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>', -                                           page, 'director name', fatal=False) - -        request = sanitized_Request( +        config = self._download_xml(              'https://www.nfb.ca/film/%s/player_config' % video_id, -            urlencode_postdata({'getConfig': 'true'})) -        request.add_header('Content-Type', 'application/x-www-form-urlencoded') -        request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf') - -        config = self._download_xml(request, video_id, 'Downloading player config XML') +            video_id, 'Downloading player config XML', +            data=urlencode_postdata({'getConfig': 'true'}), +            headers={ +                'Content-Type': 'application/x-www-form-urlencoded', +                'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf' +            }) -        title = None -        description = None -        thumbnail = None -        duration = None -        formats = [] - -        def extract_thumbnail(media): -            
thumbnails = {} -            for asset in media.findall('assets/asset'): -                thumbnails[asset.get('quality')] = asset.find('default/url').text -            if not thumbnails: -                return None -            if 'high' in thumbnails: -                return thumbnails['high'] -            return list(thumbnails.values())[0] +        title, description, thumbnail, duration, uploader, author = [None] * 6 +        thumbnails, formats = [[]] * 2 +        subtitles = {}          for media in config.findall('./player/stream/media'):              if media.get('type') == 'posterImage': -                thumbnail = extract_thumbnail(media) +                quality_key = qualities(('low', 'high')) +                thumbnails = [] +                for asset in media.findall('assets/asset'): +                    asset_url = xpath_text(asset, 'default/url', default=None) +                    if not asset_url: +                        continue +                    quality = asset.get('quality') +                    thumbnails.append({ +                        'url': asset_url, +                        'id': quality, +                        'preference': quality_key(quality), +                    })              elif media.get('type') == 'video': -                duration = int(media.get('duration')) -                title = media.find('title').text -                description = media.find('description').text -                # It seems assets always go from lower to better quality, so no need to sort +                title = xpath_text(media, 'title', fatal=True)                  for asset in media.findall('assets/asset'): -                    for x in asset: +                    quality = asset.get('quality') +                    height = int_or_none(self._search_regex( +                        r'^(\d+)[pP]$', quality or '', 'height', default=None)) +                    for node in asset: +                        streamer = xpath_text(node, 'streamerURI', 
default=None) +                        if not streamer: +                            continue +                        play_path = xpath_text(node, 'url', default=None) +                        if not play_path: +                            continue                          formats.append({ -                            'url': x.find('streamerURI').text, -                            'app': x.find('streamerURI').text.split('/', 3)[3], -                            'play_path': x.find('url').text, +                            'url': streamer, +                            'app': streamer.split('/', 3)[3], +                            'play_path': play_path,                              'rtmp_live': False, -                            'ext': 'mp4', -                            'format_id': '%s-%s' % (x.tag, asset.get('quality')), +                            'ext': 'flv', +                            'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag, +                            'height': height,                          }) +                self._sort_formats(formats) +                description = clean_html(xpath_text(media, 'description')) +                uploader = xpath_text(media, 'author') +                duration = int_or_none(media.get('duration')) +                for subtitle in media.findall('./subtitles/subtitle'): +                    subtitle_url = xpath_text(subtitle, 'url', default=None) +                    if not subtitle_url: +                        continue +                    lang = xpath_text(subtitle, 'lang', default='en') +                    subtitles.setdefault(lang, []).append({ +                        'url': subtitle_url, +                        'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(), +                    })          return {              'id': video_id,              'title': title,              'description': description, -            'thumbnail': thumbnail, +            'thumbnails': 
class NRKBaseIE(InfoExtractor):
    """Common extraction logic for NRK's mediaelement API.

    Subclasses set _API_HOST; extraction always yields a playlist result,
    with one entry per media asset (or a single entry for plain media).
    """

    def _extract_formats(self, manifest_url, video_id, fatal=True):
        """Return HDS (f4m) plus derived HLS formats for a manifest URL."""
        formats = []
        formats.extend(self._extract_f4m_formats(
            manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81',
            video_id, f4m_id='hds', fatal=fatal))
        # The HLS master lives on the same CDN path with z/->i/ and a
        # master.m3u8 filename instead of the f4m manifest.
        formats.extend(self._extract_m3u8_formats(manifest_url.replace(
            'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'),
            video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=fatal))
        return formats

    def _real_extract(self, url):
        video_id = self._match_id(url)

        data = self._download_json(
            'http://%s/mediaelement/%s' % (self._API_HOST, video_id),
            video_id, 'Downloading mediaelement JSON')

        title = data.get('fullTitle') or data.get('mainTitle') or data['title']
        video_id = data.get('id') or video_id

        entries = []

        media_assets = data.get('mediaAssets')
        if media_assets and isinstance(media_assets, list):
            # Multi-part programs get '<id>-<n>' ids and '(Part n)' titles;
            # a single asset keeps the plain id/title.
            def video_id_and_title(idx):
                return ((video_id, title) if len(media_assets) == 1
                        else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx)))
            for num, asset in enumerate(media_assets, 1):
                asset_url = asset.get('url')
                if not asset_url:
                    continue
                formats = self._extract_formats(asset_url, video_id, fatal=False)
                if not formats:
                    continue
                self._sort_formats(formats)
                entry_id, entry_title = video_id_and_title(num)
                duration = parse_duration(asset.get('duration'))
                subtitles = {}
                for subtitle in ('webVtt', 'timedText'):
                    subtitle_url = asset.get('%sSubtitlesUrl' % subtitle)
                    if subtitle_url:
                        subtitles.setdefault('no', []).append({
                            'url': compat_urllib_parse_unquote(subtitle_url)
                        })
                entries.append({
                    'id': asset.get('carrierId') or entry_id,
                    'title': entry_title,
                    'duration': duration,
                    'subtitles': subtitles,
                    'formats': formats,
                })

        if not entries:
            # Fall back to the top-level mediaUrl when no usable assets exist.
            media_url = data.get('mediaUrl')
            if media_url:
                formats = self._extract_formats(media_url, video_id)
                self._sort_formats(formats)
                duration = parse_duration(data.get('duration'))
                entries = [{
                    'id': video_id,
                    'title': title,
                    'duration': duration,
                    'formats': formats,
                }]

        if not entries:
            # Norwegian: "NRK does not have rights to show this program
            # outside Norway" — raised only when the API flags geo-blocking.
            if data.get('usageRights', {}).get('isGeoBlocked'):
                raise ExtractorError(
                    'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
                    expected=True)

        conviva = data.get('convivaStatistics') or {}
        series = conviva.get('seriesName') or data.get('seriesTitle')
        episode = conviva.get('episodeName') or data.get('episodeNumberOrDate')

        thumbnails = None
        images = data.get('images')
        if images and isinstance(images, dict):
            web_images = images.get('webImages')
            if isinstance(web_images, list):
                thumbnails = [{
                    'url': image['imageUrl'],
                    'width': int_or_none(image.get('width')),
                    'height': int_or_none(image.get('height')),
                } for image in web_images if image.get('imageUrl')]

        description = data.get('description')

        # Metadata shared by every entry of the playlist.
        common_info = {
            'description': description,
            'series': series,
            'episode': episode,
            'age_limit': parse_age_limit(data.get('legalAge')),
            'thumbnails': thumbnails,
        }

        vcodec = 'none' if data.get('mediaType') == 'Audio' else None

        # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged

        for entry in entries:
            entry.update(common_info)
            for f in entry['formats']:
                f['vcodec'] = vcodec

        return self.playlist_result(entries, video_id, title, description)


class NRKIE(NRKBaseIE):
    """NRK video/audio clips addressed by PS* ids or the nrk: scheme."""
    _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
    _API_HOST = 'v8.psapi.nrk.no'
    _TESTS = [{
        # video
        'url': 'http://www.nrk.no/video/PS*150533',
        'md5': '2f7f6eeb2aacdd99885f355428715cfa',
        'info_dict': {
            'id': '150533',
            'ext': 'mp4',
            'title': 'Dompap og andre fugler i Piip-Show',
            'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
            'duration': 263,
        }
    }, {
        # audio
        'url': 'http://www.nrk.no/video/PS*154915',
        # MD5 is unstable
        'info_dict': {
            'id': '154915',
            'ext': 'flv',
            'title': 'Slik høres internett ut når du er blind',
            'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
            'duration': 20,
        }
    }]
[{ +            'md5': '9480285eff92d64f06e02a5367970a7a', +            'info_dict': { +                'id': 'MSPO40010515-part1', +                'ext': 'flv', +                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', +                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +            }, +        }, { +            'md5': 'adbd1dbd813edaf532b0a253780719c2', +            'info_dict': { +                'id': 'MSPO40010515-part2', +                'ext': 'flv', +                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', +                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +            }, +        }], +        'info_dict': { +            'id': 'MSPO40010515', +            'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', +            'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +            'duration': 6947.52, +        }, +        'skip': 'Only works from Norway', +    }, { +        'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', +        'only_matching': True, +    }] +  class NRKPlaylistIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)' @@ -159,179 +287,3 @@ class NRKSkoleIE(InfoExtractor):          nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id')          return self.url_result('nrk:%s' % nrk_id) - - -class NRKTVIE(InfoExtractor): -    IE_DESC = 'NRK TV and NRK Radio' -    _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' 
- -    _TESTS = [ -        { -            'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', -            'info_dict': { -                'id': 'MUHH48000314', -                'ext': 'mp4', -                'title': '20 spørsmål', -                'description': 'md5:bdea103bc35494c143c6a9acdd84887a', -                'upload_date': '20140523', -                'duration': 1741.52, -            }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, -        }, -        { -            'url': 'https://tv.nrk.no/program/mdfp15000514', -            'info_dict': { -                'id': 'mdfp15000514', -                'ext': 'mp4', -                'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting', -                'description': 'md5:654c12511f035aed1e42bdf5db3b206a', -                'upload_date': '20140524', -                'duration': 4605.08, -            }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, -        }, -        { -            # single playlist video -            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', -            'md5': 'adbd1dbd813edaf532b0a253780719c2', -            'info_dict': { -                'id': 'MSPO40010515-part2', -                'ext': 'flv', -                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', -                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', -                'upload_date': '20150106', -            }, -            'skip': 'Only works from Norway', -        }, -        { -            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', -            'playlist': [ -                { -                    'md5': '9480285eff92d64f06e02a5367970a7a', -                    'info_dict': { -                        'id': 'MSPO40010515-part1', -                        
'ext': 'flv', -                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', -                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', -                        'upload_date': '20150106', -                    }, -                }, -                { -                    'md5': 'adbd1dbd813edaf532b0a253780719c2', -                    'info_dict': { -                        'id': 'MSPO40010515-part2', -                        'ext': 'flv', -                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', -                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', -                        'upload_date': '20150106', -                    }, -                }, -            ], -            'info_dict': { -                'id': 'MSPO40010515', -                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', -                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', -                'upload_date': '20150106', -                'duration': 6947.5199999999995, -            }, -            'skip': 'Only works from Norway', -        }, -        { -            'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', -            'only_matching': True, -        } -    ] - -    def _extract_f4m(self, manifest_url, video_id): -        return self._extract_f4m_formats( -            manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds') - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        part_id = mobj.group('part_id') -        base_url = mobj.group('baseurl') - -        webpage = self._download_webpage(url, video_id) - -        title = self._html_search_meta( -            'title', webpage, 'title') -        description = self._html_search_meta( -            'description', webpage, 'description') - -        thumbnail 
= self._html_search_regex( -            r'data-posterimage="([^"]+)"', -            webpage, 'thumbnail', fatal=False) -        upload_date = unified_strdate(self._html_search_meta( -            'rightsfrom', webpage, 'upload date', fatal=False)) -        duration = float_or_none(self._html_search_regex( -            r'data-duration="([^"]+)"', -            webpage, 'duration', fatal=False)) - -        # playlist -        parts = re.findall( -            r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage) -        if parts: -            entries = [] -            for current_part_id, stream_url, part_title in parts: -                if part_id and current_part_id != part_id: -                    continue -                video_part_id = '%s-part%s' % (video_id, current_part_id) -                formats = self._extract_f4m(stream_url, video_part_id) -                entries.append({ -                    'id': video_part_id, -                    'title': part_title, -                    'description': description, -                    'thumbnail': thumbnail, -                    'upload_date': upload_date, -                    'formats': formats, -                }) -            if part_id: -                if entries: -                    return entries[0] -            else: -                playlist = self.playlist_result(entries, video_id, title, description) -                playlist.update({ -                    'thumbnail': thumbnail, -                    'upload_date': upload_date, -                    'duration': duration, -                }) -                return playlist - -        formats = [] - -        f4m_url = re.search(r'data-media="([^"]+)"', webpage) -        if f4m_url: -            formats.extend(self._extract_f4m(f4m_url.group(1), video_id)) - -        m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage) -        if m3u8_url: -            formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', 
m3u8_id='hls')) -        self._sort_formats(formats) - -        subtitles_url = self._html_search_regex( -            r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1', -            webpage, 'subtitle URL', default=None, group='url') -        subtitles = {} -        if subtitles_url: -            subtitles['no'] = [{ -                'ext': 'ttml', -                'url': compat_urlparse.urljoin(base_url, subtitles_url), -            }] - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'upload_date': upload_date, -            'duration': duration, -            'formats': formats, -            'subtitles': subtitles, -        } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 95e982897..2038a6ba5 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -8,6 +8,7 @@ from ..utils import (      float_or_none,      ExtractorError,      unsmuggle_url, +    determine_ext,  )  from ..compat import compat_urllib_parse_urlencode @@ -15,71 +16,80 @@ from ..compat import compat_urllib_parse_urlencode  class OoyalaBaseIE(InfoExtractor):      _PLAYER_BASE = 'http://player.ooyala.com/'      _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/' -    _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v1/authorization/embed_code/%s/%s?' +    _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s?'      
def _extract(self, content_tree_url, video_id, domain='example.org'):          content_tree = self._download_json(content_tree_url, video_id)['content_tree']          metadata = content_tree[list(content_tree)[0]]          embed_code = metadata['embed_code']          pcode = metadata.get('asset_pcode') or embed_code -        video_info = { -            'id': embed_code, -            'title': metadata['title'], -            'description': metadata.get('description'), -            'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), -            'duration': float_or_none(metadata.get('duration'), 1000), -        } +        title = metadata['title'] + +        auth_data = self._download_json( +            self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + +            compat_urllib_parse_urlencode({ +                'domain': domain, +                'supportedFormats': 'mp4,rtmp,m3u8,hds', +            }), video_id) + +        cur_auth_data = auth_data['authorization_data'][embed_code]          urls = []          formats = [] -        for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): -            auth_data = self._download_json( -                self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + -                compat_urllib_parse_urlencode({ -                    'domain': domain, -                    'supportedFormats': supported_format -                }), -                video_id, 'Downloading %s JSON' % supported_format) - -            cur_auth_data = auth_data['authorization_data'][embed_code] - -            if cur_auth_data['authorized']: -                for stream in cur_auth_data['streams']: -                    url = base64.b64decode( -                        stream['url']['data'].encode('ascii')).decode('utf-8') -                    if url in urls: -                        continue -                    urls.append(url) -                    delivery_type = stream['delivery_type'] -                    if 
delivery_type == 'hls' or '.m3u8' in url: -                        formats.extend(self._extract_m3u8_formats( -                            url, embed_code, 'mp4', 'm3u8_native', -                            m3u8_id='hls', fatal=False)) -                    elif delivery_type == 'hds' or '.f4m' in url: -                        formats.extend(self._extract_f4m_formats( -                            url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False)) -                    elif '.smil' in url: -                        formats.extend(self._extract_smil_formats( -                            url, embed_code, fatal=False)) -                    else: -                        formats.append({ -                            'url': url, -                            'ext': stream.get('delivery_type'), -                            'vcodec': stream.get('video_codec'), -                            'format_id': delivery_type, -                            'width': int_or_none(stream.get('width')), -                            'height': int_or_none(stream.get('height')), -                            'abr': int_or_none(stream.get('audio_bitrate')), -                            'vbr': int_or_none(stream.get('video_bitrate')), -                            'fps': float_or_none(stream.get('framerate')), -                        }) -            else: -                raise ExtractorError('%s said: %s' % ( -                    self.IE_NAME, cur_auth_data['message']), expected=True) +        if cur_auth_data['authorized']: +            for stream in cur_auth_data['streams']: +                s_url = base64.b64decode( +                    stream['url']['data'].encode('ascii')).decode('utf-8') +                if s_url in urls: +                    continue +                urls.append(s_url) +                ext = determine_ext(s_url, None) +                delivery_type = stream['delivery_type'] +                if delivery_type == 'hls' or ext == 'm3u8': +                    
formats.extend(self._extract_m3u8_formats( +                        s_url, embed_code, 'mp4', 'm3u8_native', +                        m3u8_id='hls', fatal=False)) +                elif delivery_type == 'hds' or ext == 'f4m': +                    formats.extend(self._extract_f4m_formats( +                        s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False)) +                elif ext == 'smil': +                    formats.extend(self._extract_smil_formats( +                        s_url, embed_code, fatal=False)) +                else: +                    formats.append({ +                        'url': s_url, +                        'ext': ext or stream.get('delivery_type'), +                        'vcodec': stream.get('video_codec'), +                        'format_id': delivery_type, +                        'width': int_or_none(stream.get('width')), +                        'height': int_or_none(stream.get('height')), +                        'abr': int_or_none(stream.get('audio_bitrate')), +                        'vbr': int_or_none(stream.get('video_bitrate')), +                        'fps': float_or_none(stream.get('framerate')), +                    }) +        else: +            raise ExtractorError('%s said: %s' % ( +                self.IE_NAME, cur_auth_data['message']), expected=True)          self._sort_formats(formats) -        video_info['formats'] = formats -        return video_info +        subtitles = {} +        for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items(): +            sub_url = sub.get('url') +            if not sub_url: +                continue +            subtitles[lang] = [{ +                'url': sub_url, +            }] + +        return { +            'id': embed_code, +            'title': title, +            'description': metadata.get('description'), +            'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), +            'duration': 
float_or_none(metadata.get('duration'), 1000), +            'subtitles': subtitles, +            'formats': formats, +        }  class OoyalaIE(OoyalaBaseIE): diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 456561bcc..5049b870e 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -100,7 +100,7 @@ class OpenloadIE(InfoExtractor):              raise ExtractorError('File not found', expected=True)          code = self._search_regex( -            r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>', +            r'</video>\s*</div>\s*<script[^>]+>([^<]+)</script>',              webpage, 'JS code')          decoded = self.openload_decode(code) diff --git a/youtube_dl/extractor/ora.py b/youtube_dl/extractor/ora.py index 8545fb1b8..1d42be39b 100644 --- a/youtube_dl/extractor/ora.py +++ b/youtube_dl/extractor/ora.py @@ -12,8 +12,8 @@ from ..utils import (  class OraTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?ora\.tv/([^/]+/)*(?P<id>[^/\?#]+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?(?:ora\.tv|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)' +    _TESTS = [{          'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq',          'md5': 'fa33717591c631ec93b04b0e330df786',          'info_dict': { @@ -22,7 +22,10 @@ class OraTVIE(InfoExtractor):              'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!',              'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1',          } -    } +    }, { +        'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d', +        'only_matching': True, +    }]      def _real_extract(self, url):          display_id = self._match_id(url) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 514e9b433..c23b314e7 100644 --- 
a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -2,11 +2,15 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( +    parse_iso8601, +    unescapeHTML, +)  class PeriscopeIE(InfoExtractor):      IE_DESC = 'Periscope' +    IE_NAME = 'periscope'      _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)'      # Alive example URLs can be found here http://onperiscope.com/      _TESTS = [{ @@ -41,8 +45,11 @@ class PeriscopeIE(InfoExtractor):          broadcast = broadcast_data['broadcast']          status = broadcast['status'] -        uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name') -        uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id') +        user = broadcast_data.get('user', {}) + +        uploader = broadcast.get('user_display_name') or user.get('display_name') +        uploader_id = (broadcast.get('username') or user.get('username') or +                       broadcast.get('user_id') or user.get('id'))          title = '%s - %s' % (uploader, status) if uploader else status          state = broadcast.get('state').lower() @@ -79,3 +86,43 @@ class PeriscopeIE(InfoExtractor):              'thumbnails': thumbnails,              'formats': formats,          } + + +class PeriscopeUserIE(InfoExtractor): +    _VALID_URL = r'https?://www\.periscope\.tv/(?P<id>[^/]+)/?$' +    IE_DESC = 'Periscope user videos' +    IE_NAME = 'periscope:user' + +    _TEST = { +        'url': 'https://www.periscope.tv/LularoeHusbandMike/', +        'info_dict': { +            'id': 'LularoeHusbandMike', +            'title': 'LULAROE HUSBAND MIKE', +            'description': 'md5:6cf4ec8047768098da58e446e82c82f0', +        }, +        # Periscope only shows videos in the last 24 hours, so it's possible to +        # get 0 videos +        'playlist_mincount': 0, +    } + +    
def _real_extract(self, url): +        user_id = self._match_id(url) + +        webpage = self._download_webpage(url, user_id) + +        data_store = self._parse_json( +            unescapeHTML(self._search_regex( +                r'data-store=(["\'])(?P<data>.+?)\1', +                webpage, 'data store', default='{}', group='data')), +            user_id) + +        user = data_store.get('User', {}).get('user', {}) +        title = user.get('display_name') or user.get('username') +        description = user.get('description') + +        entries = [ +            self.url_result( +                'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id'])) +            for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])] + +        return self.playlist_result(entries, user_id, title, description) diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py index 6d138ef25..0bc743118 100644 --- a/youtube_dl/extractor/playwire.py +++ b/youtube_dl/extractor/playwire.py @@ -4,9 +4,8 @@ import re  from .common import InfoExtractor  from ..utils import ( -    xpath_text, +    dict_get,      float_or_none, -    int_or_none,  ) @@ -23,6 +22,19 @@ class PlaywireIE(InfoExtractor):              'duration': 145.94,          },      }, { +        # m3u8 in f4m +        'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json', +        'info_dict': { +            'id': '4840492', +            'ext': 'mp4', +            'title': 'ITV EL SHOW FULL', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        # Multiple resolutions while bitrates missing          'url': 'http://cdn.playwire.com/11625/embed/85228.html',          'only_matching': True,      }, { @@ -48,25 +60,10 @@ class PlaywireIE(InfoExtractor):          thumbnail = content.get('poster')          src = content['media']['f4m'] -        f4m = self._download_xml(src, video_id) -       
 base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True) -        formats = [] -        for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'): -            media_url = media.get('url') -            if not media_url: -                continue -            tbr = int_or_none(media.get('bitrate')) -            width = int_or_none(media.get('width')) -            height = int_or_none(media.get('height')) -            f = { -                'url': '%s/%s' % (base_url, media.attrib['url']), -                'tbr': tbr, -                'width': width, -                'height': height, -            } -            if not (tbr or width or height): -                f['quality'] = 1 if '-hd.' in media_url else 0 -            formats.append(f) +        formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls') +        for a_format in formats: +            if not dict_get(a_format, ['tbr', 'width', 'height']): +                a_format['quality'] = 1 if '-hd.' 
in a_format['url'] else 0          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py new file mode 100644 index 000000000..4f05bbddc --- /dev/null +++ b/youtube_dl/extractor/radiocanada.py @@ -0,0 +1,130 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    xpath_text, +    find_xpath_attr, +    determine_ext, +    int_or_none, +    unified_strdate, +    xpath_element, +    ExtractorError, +) + + +class RadioCanadaIE(InfoExtractor): +    IE_NAME = 'radiocanada' +    _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', +        'info_dict': { +            'id': '7184272', +            'ext': 'flv', +            'title': 'Le parcours du tireur capté sur vidéo', +            'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', +            'upload_date': '20141023', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        app_code, video_id = re.match(self._VALID_URL, url).groups() + +        formats = [] +        # TODO: extract m3u8 and f4m formats +        # m3u8 formats can be extracted using ipad device_type return 403 error code when ffmpeg try to download segements +        # f4m formats can be extracted using flashhd device_type but they produce unplayable file +        for device_type in ('flash',): +            v_data = self._download_xml( +                'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx', +                video_id, note='Downloading %s XML' % device_type, query={ +                    'appCode': app_code, +                    
'idMedia': video_id, +                    'connectionType': 'broadband', +                    'multibitrate': 'true', +                    'deviceType': device_type, +                    # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction +                    'paysJ391wsHjbOJwvCs26toz': 'CA', +                    'bypasslock': 'NZt5K62gRqfc', +                }) +            v_url = xpath_text(v_data, 'url') +            if not v_url: +                continue +            if v_url == 'null': +                raise ExtractorError('%s said: %s' % ( +                    self.IE_NAME, xpath_text(v_data, 'message')), expected=True) +            ext = determine_ext(v_url) +            if ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    v_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) +            elif ext == 'f4m': +                formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False)) +            else: +                ext = determine_ext(v_url) +                bitrates = xpath_element(v_data, 'bitrates') +                for url_e in bitrates.findall('url'): +                    tbr = int_or_none(url_e.get('bitrate')) +                    if not tbr: +                        continue +                    formats.append({ +                        'format_id': 'rtmp-%d' % tbr, +                        'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url), +                        'ext': 'flv', +                        'protocol': 'rtmp', +                        'width': int_or_none(url_e.get('width')), +                        'height': int_or_none(url_e.get('height')), +                        'tbr': tbr, +                    }) +        self._sort_formats(formats) + +        metadata = self._download_xml( +            'http://api.radio-canada.ca/metaMedia/v1/index.ashx', +            video_id, note='Downloading metadata XML', query={ +                
'appCode': app_code, +                'idMedia': video_id, +            }) + +        def get_meta(name): +            el = find_xpath_attr(metadata, './/Meta', 'name', name) +            return el.text if el is not None else None + +        return { +            'id': video_id, +            'title': get_meta('Title'), +            'description': get_meta('Description') or get_meta('ShortDescription'), +            'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), +            'duration': int_or_none(get_meta('length')), +            'series': get_meta('Emission'), +            'season_number': int_or_none(get_meta('SrcSaison')), +            'episode_number': int_or_none(get_meta('SrcEpisode')), +            'upload_date': unified_strdate(get_meta('Date')), +            'formats': formats, +        } + + +class RadioCanadaAudioVideoIE(InfoExtractor): +    IE_NAME = 'radiocanada:audiovideo' +    _VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', +        'info_dict': { +            'id': '7527184', +            'ext': 'flv', +            'title': 'Barack Obama au Vietnam', +            'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam', +            'upload_date': '20160523', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        return self.url_result('radiocanada:medianet:%s' % self._match_id(url)) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 7ba41ba59..721fc3a9e 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,7 +1,12 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( +    ExtractorError, +    
int_or_none, +    str_to_int, +    unified_strdate, +)  class RedTubeIE(InfoExtractor): @@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor):              'id': '66418',              'ext': 'mp4',              'title': 'Sucked on a toilet', +            'upload_date': '20120831', +            'duration': 596, +            'view_count': int,              'age_limit': 18,          }      } @@ -24,12 +32,39 @@          if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):              raise ExtractorError('Video %s has been removed' % video_id, expected=True) -        video_url = self._html_search_regex( -            r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') -        video_title = self._html_search_regex( -            r'<h1 class="videoTitle[^"]*">(.+?)</h1>', -            webpage, 'title') -        video_thumbnail = self._og_search_thumbnail(webpage) +        title = self._html_search_regex( +            (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>', +             r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), +            webpage, 'title', group='title') + +        formats = [] +        sources = self._parse_json( +            self._search_regex( +                r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), +            video_id, fatal=False) +        if sources and isinstance(sources, dict): +            for format_id, format_url in sources.items(): +                if format_url: +                    formats.append({ +                        'url': format_url, +                        'format_id': format_id, +                        'height': int_or_none(format_id), +                    }) +        else: +            video_url = self._html_search_regex( +                r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') +            formats.append({'url': video_url}) +        self._sort_formats(formats) + +        thumbnail = 
self._og_search_thumbnail(webpage) +        upload_date = unified_strdate(self._search_regex( +            r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', +            webpage, 'upload date', fatal=False)) +        duration = int_or_none(self._search_regex( +            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) +        view_count = str_to_int(self._search_regex( +            r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', +            webpage, 'view count', fatal=False))          # No self-labeling, but they describe themselves as          # "Home of Videos Porno" @@ -37,9 +72,12 @@ class RedTubeIE(InfoExtractor):          return {              'id': video_id, -            'url': video_url,              'ext': 'mp4', -            'title': video_title, -            'thumbnail': video_thumbnail, +            'title': title, +            'thumbnail': thumbnail, +            'upload_date': upload_date, +            'duration': duration, +            'view_count': view_count,              'age_limit': age_limit, +            'formats': formats,          } diff --git a/youtube_dl/extractor/reuters.py b/youtube_dl/extractor/reuters.py new file mode 100644 index 000000000..961d504eb --- /dev/null +++ b/youtube_dl/extractor/reuters.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    js_to_json, +    int_or_none, +    unescapeHTML, +) + + +class ReutersIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562', +        'md5': '8015113643a0b12838f160b0b81cc2ee', +        'info_dict': { +            'id': '368575562', +            'ext': 'mp4', +            'title': 'San Francisco police chief resigns', +        } +    } + +    def _real_extract(self, url): +        video_id = 
self._match_id(url) +        webpage = self._download_webpage( +            'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id) +        video_data = js_to_json(self._search_regex( +            r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);', +            webpage, 'video data')) + +        def get_json_value(key, fatal=False): +            return self._search_regex('"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal) + +        title = unescapeHTML(get_json_value('title', fatal=True)) +        mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups() + +        mas_data = self._download_json( +            'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid), +            video_id, transform_source=js_to_json) +        formats = [] +        for f in mas_data: +            f_url = f.get('url') +            if not f_url: +                continue +            method = f.get('method') +            if method == 'hls': +                formats.extend(self._extract_m3u8_formats( +                    f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) +            else: +                container = f.get('container') +                ext = '3gp' if method == 'mobile' else container +                formats.append({ +                    'format_id': ext, +                    'url': f_url, +                    'ext': ext, +                    'container': container if method != 'mobile' else None, +                }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': get_json_value('thumb'), +            'duration': int_or_none(get_json_value('seconds')), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py index 99979ebe1..833d8a2f0 100644 --- a/youtube_dl/extractor/revision3.py +++ b/youtube_dl/extractor/revision3.py @@ 
-13,8 +13,64 @@ from ..utils import (  ) +class Revision3EmbedIE(InfoExtractor): +    IE_NAME = 'revision3:embed' +    _VALID_URL = r'(?:revision3:(?:(?P<playlist_type>[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P<playlist_id>\d+)' +    _TEST = { +        'url': 'http://api.seekernetwork.com/player/embed?videoId=67558', +        'md5': '83bcd157cab89ad7318dd7b8c9cf1306', +        'info_dict': { +            'id': '67558', +            'ext': 'mp4', +            'title': 'The Pros & Cons Of Zoos', +            'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', +            'uploader_id': 'dnews', +            'uploader': 'DNews', +        } +    } +    _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        playlist_id = mobj.group('playlist_id') +        playlist_type = mobj.group('playlist_type') or 'video_id' +        video_data = self._download_json( +            'http://revision3.com/api/getPlaylist.json', playlist_id, query={ +                'api_key': self._API_KEY, +                'codecs': 'h264,vp8,theora', +                playlist_type: playlist_id, +            })['items'][0] + +        formats = [] +        for vcodec, media in video_data['media'].items(): +            for quality_id, quality in media.items(): +                if quality_id == 'hls': +                    formats.extend(self._extract_m3u8_formats( +                        quality['url'], playlist_id, 'mp4', +                        'm3u8_native', m3u8_id='hls', fatal=False)) +                else: +                    formats.append({ +                        'url': quality['url'], +                        'format_id': '%s-%s' % (vcodec, quality_id), +                        'tbr': int_or_none(quality.get('bitrate')), +                        
'vcodec': vcodec, +                    }) +        self._sort_formats(formats) + +        return { +            'id': playlist_id, +            'title': unescapeHTML(video_data['title']), +            'description': unescapeHTML(video_data.get('summary')), +            'uploader': video_data.get('show', {}).get('name'), +            'uploader_id': video_data.get('show', {}).get('slug'), +            'duration': int_or_none(video_data.get('duration')), +            'formats': formats, +        } + +  class Revision3IE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|testtube|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)' +    IE_NAME = 'revision' +    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)'      _TESTS = [{          'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',          'md5': 'd94a72d85d0a829766de4deb8daaf7df', @@ -32,52 +88,14 @@ class Revision3IE(InfoExtractor):          }      }, {          # Show -        'url': 'http://testtube.com/brainstuff', -        'info_dict': { -            'id': '251', -            'title': 'BrainStuff', -            'description': 'Whether the topic is popcorn or particle physics, you can count on the HowStuffWorks team to explore-and explain-the everyday science in the world around us on BrainStuff.', -        }, -        'playlist_mincount': 93, -    }, { -        'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial', -        'info_dict': { -            'id': '58227', -            'display_id': 'dnews/5-weird-ways-plants-can-eat-animals', -            'duration': 275, -            'ext': 'webm', -            'title': '5 Weird Ways Plants Can Eat Animals', -            'description': 'Why have some plants evolved to eat meat?', -            'upload_date': '20150120', -            'timestamp': 1421763300, -            'uploader': 'DNews', 
-            'uploader_id': 'dnews', -        }, -    }, { -        'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', -        'info_dict': { -            'id': '71618', -            'ext': 'mp4', -            'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', -            'title': 'The Israel-Palestine Conflict Explained in Ten Minutes', -            'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great place to start', -            'uploader': 'Editors\' Picks', -            'uploader_id': 'tt-editors-picks', -            'timestamp': 1453309200, -            'upload_date': '20160120', -        }, -        'add_ie': ['Youtube'], +        'url': 'http://revision3.com/variant', +        'only_matching': True,      }, {          # Tag -        'url': 'http://testtube.com/tech-news', -        'info_dict': { -            'id': '21018', -            'title': 'tech news', -        }, -        'playlist_mincount': 9, +        'url': 'http://revision3.com/vr', +        'only_matching': True,      }]      _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s' -    _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'      def _real_extract(self, url):          domain, display_id = re.match(self._VALID_URL, url).groups() @@ -119,33 +137,9 @@ class Revision3IE(InfoExtractor):                  })                  return info -            video_data = self._download_json( -                'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id), -                video_id)['items'][0] - -            formats = [] -            for vcodec, media in video_data['media'].items(): -                for quality_id, quality in media.items(): -                    if quality_id == 'hls': -                        formats.extend(self._extract_m3u8_formats( -                         
   quality['url'], video_id, 'mp4', -                            'm3u8_native', m3u8_id='hls', fatal=False)) -                    else: -                        formats.append({ -                            'url': quality['url'], -                            'format_id': '%s-%s' % (vcodec, quality_id), -                            'tbr': int_or_none(quality.get('bitrate')), -                            'vcodec': vcodec, -                        }) -            self._sort_formats(formats) -              info.update({ -                'title': unescapeHTML(video_data['title']), -                'description': unescapeHTML(video_data.get('summary')), -                'uploader': video_data.get('show', {}).get('name'), -                'uploader_id': video_data.get('show', {}).get('slug'), -                'duration': int_or_none(video_data.get('duration')), -                'formats': formats, +                '_type': 'url_transparent', +                'url': 'revision3:%s' % video_id,              })              return info          else: diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 79af47715..f11e3588b 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -6,6 +6,9 @@ import re  import time  from .common import InfoExtractor +from ..compat import ( +    compat_struct_unpack, +)  from ..utils import (      ExtractorError,      float_or_none, @@ -13,7 +16,6 @@ from ..utils import (      remove_start,      sanitized_Request,      std_headers, -    struct_unpack,  ) @@ -21,7 +23,7 @@ def _decrypt_url(png):      encrypted_data = base64.b64decode(png.encode('utf-8'))      text_index = encrypted_data.find(b'tEXt')      text_chunk = encrypted_data[text_index - 4:] -    length = struct_unpack('!I', text_chunk[:4])[0] +    length = compat_struct_unpack('!I', text_chunk[:4])[0]      # Use bytearray to get integers when iterating in both python 2.x and 3.x      data = bytearray(text_chunk[8:8 + length])      data = 
[chr(b) for b in data if b != 0] @@ -62,7 +64,7 @@ def _decrypt_url(png):  class RTVEALaCartaIE(InfoExtractor):      IE_NAME = 'rtve.es:alacarta'      IE_DESC = 'RTVE a la carta' -    _VALID_URL = r'https?://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' +    _VALID_URL = r'https?://www\.rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)'      _TESTS = [{          'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', @@ -85,6 +87,9 @@ class RTVEALaCartaIE(InfoExtractor):      }, {          'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve',          'only_matching': True, +    }, { +        'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', +        'only_matching': True,      }]      def _real_initialize(self): diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py new file mode 100644 index 000000000..3b9c65e7e --- /dev/null +++ b/youtube_dl/extractor/seeker.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SeekerIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html' +    _TESTS = [{ +        # player.loadRevision3Item +        'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', +        'md5': '30c1dc4030cc715cf05b423d0947ac18', +        'info_dict': { +            'id': '76243', +            'ext': 'webm', +            'title': 'Should Trump Be Required To Release His Tax Returns?', +            'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. 
So what can we learn if he decides to release them?', +            'uploader': 'Seeker Daily', +            'uploader_id': 'seekerdaily', +        } +    }, { +        'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', +        'playlist': [ +            { +                'md5': '83bcd157cab89ad7318dd7b8c9cf1306', +                'info_dict': { +                    'id': '67558', +                    'ext': 'mp4', +                    'title': 'The Pros & Cons Of Zoos', +                    'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', +                    'uploader': 'DNews', +                    'uploader_id': 'dnews', +                }, +            } +        ], +        'info_dict': { +            'id': '1834116536', +            'title': 'After Gorilla Killing, Changes Ahead for Zoos', +            'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.', +        }, +    }] + +    def _real_extract(self, url): +        display_id, article_id = re.match(self._VALID_URL, url).groups() +        webpage = self._download_webpage(url, display_id) +        mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage) +        if mobj: +            playlist_type, playlist_id = mobj.groups() +            return self.url_result( +                'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id) +        else: +            entries = [self.url_result('revision3:video_id:%s' % video_id, 'Revision3Embed', video_id) for video_id in re.findall( +                r'<iframe[^>]+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)', webpage)] +            return self.playlist_result( +                entries, article_id, self._og_search_title(webpage), 
self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py new file mode 100644 index 000000000..1c636f672 --- /dev/null +++ b/youtube_dl/extractor/sendtonews.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .jwplatform import JWPlatformBaseIE +from ..compat import compat_parse_qs +from ..utils import ( +    ExtractorError, +    parse_duration, +) + + +class SendtoNewsIE(JWPlatformBaseIE): +    _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P<query>[^#]+)' + +    _TEST = { +        # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ +        'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes', +        'info_dict': { +            'id': 'GxfCe0Zo7D-175909-5588', +            'ext': 'mp4', +            'title': 'Recap: CLE 15, CIN 6', +            'description': '5/16/16: Indians\' bats explode for 15 runs in a win', +            'duration': 49, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    } + +    _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s' + +    @classmethod +    def _extract_url(cls, webpage): +        mobj = re.search(r'''(?x)<script[^>]+src=([\'"]) +            (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\? 
+                .*\bSC=(?P<SC>[0-9a-zA-Z-]+).* +            \1>''', webpage) +        if mobj: +            sk, mk, pk = mobj.group('SC').split('-') +            return cls._URL_TEMPLATE % (sk, mk, pk) + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        params = compat_parse_qs(mobj.group('query')) + +        if 'SK' not in params or 'MK' not in params or 'PK' not in params: +            raise ExtractorError('Invalid URL', expected=True) + +        video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]]) + +        webpage = self._download_webpage(url, video_id) + +        jwplayer_data_str = self._search_regex( +            r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data') +        js_vars = { +            'w': 1024, +            'h': 768, +            'modeVar': 'html5', +        } +        for name, val in js_vars.items(): +            js_val = '%d' % val if isinstance(val, int) else '"%s"' % val +            jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val) + +        info_dict = self._parse_jwplayer_data( +            self._parse_json(jwplayer_data_str, video_id), +            video_id, require_title=False, rtmp_params={'no_resume': True}) + +        title = self._html_search_regex( +            r'<div[^>]+class="embedTitle">([^<]+)</div>', webpage, 'title') +        description = self._html_search_regex( +            r'<div[^>]+class="embedSubTitle">([^<]+)</div>', webpage, +            'description', fatal=False) +        duration = parse_duration(self._html_search_regex( +            r'<div[^>]+class="embedDetails">([0-9:]+)', webpage, +            'duration', fatal=False)) + +        info_dict.update({ +            'title': title, +            'description': description, +            'duration': duration, +        }) + +        return info_dict diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index d03f1b1d4..8fc66732a 100644 --- 
a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -4,28 +4,35 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode -from ..utils import sanitized_Request +from ..utils import ( +    HEADRequest, +    ExtractorError, +    int_or_none, +    update_url_query, +    qualities, +    get_element_by_attribute, +    clean_html, +)  class SinaIE(InfoExtractor): -    _VALID_URL = r'''(?x)https?://(.*?\.)?video\.sina\.com\.cn/ -                        ( -                            (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=)|b/)(?P<id>\d+?)($|&|\-)))) -                            | +    _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ +                        (?: +                            (?:view/|.*\#)(?P<video_id>\d+)| +                            .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|                              # This is used by external sites like Weibo -                            (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf) +                            api/sinawebApi/outplay.php/(?P<token>.+?)\.swf                          )                    '''      _TESTS = [          { -            'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', -            'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f', +            'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622', +            'md5': 'd38433e2fc886007729735650ae4b3e9',              'info_dict': { -                'id': '110028898', -                'ext': 'flv', -                'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', +                'id': '250576622', +                'ext': 'mp4', +                'title': '现场:克鲁兹宣布退选 特朗普将稳获提名',              }          },          { @@ -35,37 +42,74 @@ class SinaIE(InfoExtractor):                  'ext': 'flv',                  'title': '军方提高对朝情报监视级别',              }, +            'skip': 
'the page does not exist or has been deleted', +        }, +        { +            'url': 'http://video.sina.com.cn/view/250587748.html', +            'md5': '3d1807a25c775092aab3bc157fff49b4', +            'info_dict': { +                'id': '250587748', +                'ext': 'mp4', +                'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光', +            },          },      ] -    def _extract_video(self, video_id): -        data = compat_urllib_parse_urlencode({'vid': video_id}) -        url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, -                                     video_id, 'Downloading video url') -        image_page = self._download_webpage( -            'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data, -            video_id, 'Downloading thumbnail info') - -        return {'id': video_id, -                'url': url_doc.find('./durl/url').text, -                'ext': 'flv', -                'title': url_doc.find('./vname').text, -                'thumbnail': image_page.split('=')[1], -                } -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        if mobj.group('token') is not None: -            # The video id is in the redirected url -            self.to_screen('Getting video id') -            request = sanitized_Request(url) -            request.get_method = lambda: 'HEAD' -            (_, urlh) = self._download_webpage_handle(request, 'NA', False) -            return self._real_extract(urlh.geturl()) -        elif video_id is None: -            pseudo_id = mobj.group('pseudo_id') -            webpage = self._download_webpage(url, pseudo_id) -            video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, 'video id') -        return self._extract_video(video_id) +        video_id = mobj.group('video_id') +        if not video_id: +            if mobj.group('token') is not None: +                # The video id is 
in the redirected url +                self.to_screen('Getting video id') +                request = HEADRequest(url) +                (_, urlh) = self._download_webpage_handle(request, 'NA', False) +                return self._real_extract(urlh.geturl()) +            else: +                pseudo_id = mobj.group('pseudo_id') +                webpage = self._download_webpage(url, pseudo_id) +                error = get_element_by_attribute('class', 'errtitle', webpage) +                if error: +                    raise ExtractorError('%s said: %s' % ( +                        self.IE_NAME, clean_html(error)), expected=True) +                video_id = self._search_regex( +                    r"video_id\s*:\s*'(\d+)'", webpage, 'video id') + +        video_data = self._download_json( +            'http://s.video.sina.com.cn/video/h5play', +            video_id, query={'video_id': video_id}) +        if video_data['code'] != 1: +            raise ExtractorError('%s said: %s' % ( +                self.IE_NAME, video_data['message']), expected=True) +        else: +            video_data = video_data['data'] +            title = video_data['title'] +            description = video_data.get('description') +            if description: +                description = description.strip() + +            preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd']) +            formats = [] +            for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items(): +                file_api = quality.get('file_api') +                file_id = quality.get('file_id') +                if not file_api or not file_id: +                    continue +                formats.append({ +                    'format_id': quality_id, +                    'url': update_url_query(file_api, {'vid': file_id}), +                    'preference': preference(quality_id), +                    'ext': 'mp4', +                }) +            self._sort_formats(formats) + +           
 return { +                'id': video_id, +                'title': title, +                'description': description, +                'thumbnail': video_data.get('image'), +                'duration': int_or_none(video_data.get('length')), +                'timestamp': int_or_none(video_data.get('create_time')), +                'formats': formats, +            } diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 692fd78e8..92a7120a3 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -96,20 +96,18 @@ class SpankwireIE(InfoExtractor):          formats = []          for height, video_url in zip(heights, video_urls):              path = compat_urllib_parse_urlparse(video_url).path -            _, quality = path.split('/')[4].split('_')[:2] -            f = { +            m = re.search(r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', path) +            if m: +                tbr = int(m.group('tbr')) +                height = int(m.group('height')) +            else: +                tbr = None +            formats.append({                  'url': video_url, +                'format_id': '%dp' % height,                  'height': height, -            } -            tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None) -            if tbr: -                f.update({ -                    'tbr': int(tbr), -                    'format_id': '%dp' % height, -                }) -            else: -                f['format_id'] = quality -            formats.append(f) +                'tbr': tbr, +            })          self._sort_formats(formats)          age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py index e0477382c..d14d93e3a 100644 --- a/youtube_dl/extractor/teachingchannel.py +++ b/youtube_dl/extractor/teachingchannel.py @@ -11,6 +11,7 @@ class TeachingChannelIE(InfoExtractor):      _TEST = {    
      'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution', +        'md5': '3d6361864d7cac20b57c8784da17166f',          'info_dict': {              'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',              'ext': 'mp4', @@ -19,9 +20,9 @@ class TeachingChannelIE(InfoExtractor):              'duration': 422.255,          },          'params': { -            # m3u8 download              'skip_download': True,          }, +        'add_ie': ['Ooyala'],      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index b49ab5f5b..79a778920 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -88,7 +88,7 @@ class TeamcocoIE(InfoExtractor):          preload_codes = self._html_search_regex(              r'(function.+)setTimeout\(function\(\)\{playlist',              webpage, 'preload codes') -        base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes) +        base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes)          base64_fragments.remove('init')          def _check_sequence(cur_fragments): diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 6f8333cfc..9092e9b85 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -2,14 +2,16 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( +    determine_ext, +    remove_end, +)  class TelegraafIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html'      _TEST = {          'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', -        'md5': '83245a9779bcc4a24454bfd53c65b6dc',          'info_dict': {              'id': '24353229',              'ext': 'mp4', @@ -18,18 +20,60 @@ class TelegraafIE(InfoExtractor):              
'thumbnail': 're:^https?://.*\.jpg$',              'duration': 33,          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }      def _real_extract(self, url): -        playlist_id = self._match_id(url) +        video_id = self._match_id(url) -        webpage = self._download_webpage(url, playlist_id) +        webpage = self._download_webpage(url, video_id) +        player_url = self._html_search_regex( +            r'<iframe[^>]+src="([^"]+)"', webpage, 'player URL') +        player_page = self._download_webpage( +            player_url, video_id, note='Download player webpage')          playlist_url = self._search_regex( -            r"iframe\.loadPlayer\('([^']+)'", webpage, 'player') +            r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL') +        playlist_data = self._download_json(playlist_url, video_id) + +        item = playlist_data['items'][0] +        formats = [] +        locations = item['locations'] +        for location in locations.get('adaptive', []): +            manifest_url = location['src'] +            ext = determine_ext(manifest_url) +            if ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    manifest_url, video_id, ext='mp4', m3u8_id='hls')) +            elif ext == 'mpd': +                # TODO: Current DASH formats are broken - $Time$ pattern in +                # <SegmentTemplate> not implemented yet +                continue +            else: +                self.report_warning('Unknown adaptive format %s' % ext) +        for location in locations.get('progressive', []): +            formats.append({ +                'url': location['sources'][0]['src'], +                'width': location.get('width'), +                'height': location.get('height'), +                'format_id': 'http-%s' % location['label'], +            }) + +        self._sort_formats(formats) -        entries = 
self._extract_xspf_playlist(playlist_url, playlist_id)          title = remove_end(self._og_search_title(webpage), ' - VIDEO')          description = self._og_search_description(webpage) +        duration = item.get('duration') +        thumbnail = item.get('poster') -        return self.playlist_result(entries, playlist_id, title, description) +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'formats': formats, +            'duration': duration, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 3f54b2744..6c848dc6f 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,7 +6,7 @@ from .common import InfoExtractor  class TF1IE(InfoExtractor):      """TF1 uses the wat.tv player.""" -    _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' +    _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)'      _TESTS = [{          'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',          'info_dict': { @@ -48,6 +48,6 @@ class TF1IE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          wat_id = self._html_search_regex( -            r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})(?:#.*?)?\1', +            r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8}).*?\1',              webpage, 'wat id', group='id')          return self.url_result('wat:%s' % wat_id, 'Wat') diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index a25417f94..5793ec6ef 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -14,11 +14,13 @@ from ..compat import (      
compat_urllib_parse_urlparse,  )  from ..utils import ( +    determine_ext,      ExtractorError,      float_or_none,      int_or_none,      sanitized_Request,      unsmuggle_url, +    update_url_query,      xpath_with_ns,      mimetype2ext,      find_xpath_attr, @@ -48,6 +50,12 @@ class ThePlatformBaseIE(OnceIE):              if OnceIE.suitable(_format['url']):                  formats.extend(self._extract_once_formats(_format['url']))              else: +                media_url = _format['url'] +                if determine_ext(media_url) == 'm3u8': +                    hdnea2 = self._get_cookies(media_url).get('hdnea2') +                    if hdnea2: +                        _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value}) +                  formats.append(_format)          subtitles = self._parse_smil_subtitles(meta, default_ns) @@ -151,6 +159,22 @@ class ThePlatformIE(ThePlatformBaseIE):          'only_matching': True,      }] +    @classmethod +    def _extract_urls(cls, webpage): +        m = re.search( +            r'''(?x) +                    <meta\s+ +                        property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+ +                        content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2 +            ''', webpage) +        if m: +            return [m.group('url')] + +        matches = re.findall( +            r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) +        if matches: +            return list(zip(*matches))[1] +      @staticmethod      def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):          flags = '10' if include_qs else '00' diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py index d8b1fd281..d63aef5de 100644 --- a/youtube_dl/extractor/thesixtyone.py +++ b/youtube_dl/extractor/thesixtyone.py @@ -12,7 +12,7 @@ class TheSixtyOneIE(InfoExtractor):              s|              
song/comments/list|              song -        )/(?P<id>[A-Za-z0-9]+)/?$''' +        )/(?:[^/]+/)?(?P<id>[A-Za-z0-9]+)/?$'''      _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'      _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream'      _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop' @@ -45,6 +45,10 @@ class TheSixtyOneIE(InfoExtractor):              'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/',              'only_matching': True,          }, +        { +            'url': 'http://www.thesixtyone.com/maryatmidnight/song/StrawberriesandCream/yvWtLp0c4GQ/', +            'only_matching': True, +        },      ]      _DECODE_MAP = { diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py new file mode 100644 index 000000000..c77a07989 --- /dev/null +++ b/youtube_dl/extractor/threeqsdn.py @@ -0,0 +1,139 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    js_to_json, +    mimetype2ext, +) + + +class ThreeQSDNIE(InfoExtractor): +    IE_NAME = '3qsdn' +    IE_DESC = '3Q SDN' +    _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' +    _TESTS = [{ +        # ondemand from http://www.philharmonie.tv/veranstaltung/26/ +        'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', +        'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd', +        'info_dict': { +            'id': '0280d6b9-1215-11e6-b427-0cc47a188158', +            'ext': 'mp4', +            'title': '0280d6b9-1215-11e6-b427-0cc47a188158', +            'is_live': False, +        }, +        'expected_warnings': ['Failed to download MPD manifest'], +    }, { +        # live video stream +        'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', +        'info_dict': { +            'id': 
'd755d94b-4ab9-11e3-9162-0025907ad44f', +            'ext': 'mp4', +            'title': 'd755d94b-4ab9-11e3-9162-0025907ad44f', +            'is_live': False, +        }, +    }, { +        # live audio stream +        'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48', +        'only_matching': True, +    }, { +        # live audio stream with some 404 URLs +        'url': 'http://playout.3qsdn.com/ac5c3186-777a-11e2-9c30-9acf09e2db48', +        'only_matching': True, +    }, { +        # geo restricted with 'This content is not available in your country' +        'url': 'http://playout.3qsdn.com/d63a3ffe-75e8-11e2-9c30-9acf09e2db48', +        'only_matching': True, +    }, { +        # geo restricted with 'playout.3qsdn.com/forbidden' +        'url': 'http://playout.3qsdn.com/8e330f26-6ae2-11e2-a16a-9acf09e2db48', +        'only_matching': True, +    }, { +        # live video with rtmp link +        'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_url(webpage): +        mobj = re.search( +            r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage) +        if mobj: +            return mobj.group('url') + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        js = self._download_webpage( +            'http://playout.3qsdn.com/%s' % video_id, video_id, +            query={'js': 'true'}) + +        if any(p in js for p in ( +                '>This content is not available in your country', +                'playout.3qsdn.com/forbidden')): +            self.raise_geo_restricted() + +        stream_content = self._search_regex( +            r'streamContent\s*:\s*(["\'])(?P<content>.+?)\1', js, +            'stream content', default='demand', group='content') + +        live = stream_content == 'live' + +        stream_type = self._search_regex( +            
r'streamType\s*:\s*(["\'])(?P<type>audio|video)\1', js, +            'stream type', default='video', group='type') + +        formats = [] +        urls = set() + +        def extract_formats(item_url, item={}): +            if not item_url or item_url in urls: +                return +            urls.add(item_url) +            type_ = item.get('type') +            ext = determine_ext(item_url, default_ext=None) +            if type_ == 'application/dash+xml' or ext == 'mpd': +                formats.extend(self._extract_mpd_formats( +                    item_url, video_id, mpd_id='mpd', fatal=False)) +            elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    item_url, video_id, 'mp4', +                    entry_protocol='m3u8' if live else 'm3u8_native', +                    m3u8_id='hls', fatal=False)) +            elif ext == 'f4m': +                formats.extend(self._extract_f4m_formats( +                    item_url, video_id, f4m_id='hds', fatal=False)) +            else: +                if not self._is_valid_url(item_url, video_id): +                    return +                formats.append({ +                    'url': item_url, +                    'format_id': item.get('quality'), +                    'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext, +                    'vcodec': 'none' if stream_type == 'audio' else None, +                }) + +        for item_js in re.findall(r'({.*?\b(?:src|source)\s*:\s*["\'].+?})', js): +            f = self._parse_json( +                item_js, video_id, transform_source=js_to_json, fatal=False) +            if not f: +                continue +            extract_formats(f.get('src'), f) + +        # More relaxed version to collect additional URLs and acting +        # as a future-proof fallback +        for _, src in 
re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js): +            extract_formats(src) + +        self._sort_formats(formats) + +        title = self._live_title(video_id) if live else video_id + +        return { +            'id': video_id, +            'title': title, +            'is_live': live, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index f57d609d4..a4997cb89 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8  from __future__ import unicode_literals  import re @@ -6,20 +6,13 @@ import re  from .common import InfoExtractor -class TvpIE(InfoExtractor): -    IE_NAME = 'tvp.pl' -    _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$' +class TVPIE(InfoExtractor): +    IE_NAME = 'tvp' +    IE_DESC = 'Telewizja Polska' +    _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)'      _TESTS = [{ -        'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035', -        'md5': 'cdd98303338b8a7f7abab5cd14092bf2', -        'info_dict': { -            'id': '4278035', -            'ext': 'wmv', -            'title': 'Ogniem i mieczem, odc. 2', -        }, -    }, { -        'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536', +        'url': 'http://vod.tvp.pl/194536/i-seria-odc-13',          'md5': '8aa518c15e5cc32dfe8db400dc921fbb',          'info_dict': {              'id': '194536', @@ -36,12 +29,22 @@ class TvpIE(InfoExtractor):          },      }, {          'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', -        'md5': 'c3b15ed1af288131115ff17a17c19dda', -        'info_dict': { -            'id': '17834272', -            'ext': 'mp4', -            'title': 'Na sygnale, odc. 
39', -        }, +        'only_matching': True, +    }, { +        'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200', +        'only_matching': True, +    }, { +        'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa', +        'only_matching': True, +    }, { +        'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach', +        'only_matching': True, +    }, { +        'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum', +        'only_matching': True, +    }, { +        'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -92,8 +95,8 @@ class TvpIE(InfoExtractor):          } -class TvpSeriesIE(InfoExtractor): -    IE_NAME = 'tvp.pl:Series' +class TVPSeriesIE(InfoExtractor): +    IE_NAME = 'tvp:series'      _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'      _TESTS = [{ @@ -127,7 +130,7 @@ class TvpSeriesIE(InfoExtractor):          videos_paths = re.findall(              '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)          entries = [ -            self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key()) +            self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key())              for v_path in videos_paths]          return { diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index e03e2dbaa..4025edf02 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -47,7 +47,8 @@ class TwentyFourVideoIE(InfoExtractor):          title = self._og_search_title(webpage)          description = self._html_search_regex( -            r'<span itemprop="description">([^<]+)</span>', webpage, 'description', fatal=False) +            
r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>', +            webpage, 'description', fatal=False, group='description')          thumbnail = self._og_search_thumbnail(webpage)          duration = int_or_none(self._og_search_property(              'duration', webpage, 'duration', fatal=False)) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 36ee1adff..d898f14c3 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -171,6 +171,7 @@ class TwitchVideoIE(TwitchItemBaseIE):              'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',          },          'playlist_mincount': 12, +        'skip': 'HTTP Error 404: Not Found',      } @@ -187,6 +188,7 @@ class TwitchChapterIE(TwitchItemBaseIE):              'title': 'ACRL Off Season - Sports Cars @ Nordschleife',          },          'playlist_mincount': 3, +        'skip': 'HTTP Error 404: Not Found',      }, {          'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',          'only_matching': True, @@ -258,7 +260,7 @@ class TwitchVodIE(TwitchItemBaseIE):                      'nauth': access_token['token'],                      'nauthsig': access_token['sig'],                  })), -            item_id, 'mp4') +            item_id, 'mp4', entry_protocol='m3u8_native')          self._prefer_source(formats)          info['formats'] = formats @@ -355,31 +357,6 @@ class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):      } -class TwitchBookmarksIE(TwitchPlaylistBaseIE): -    IE_NAME = 'twitch:bookmarks' -    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE -    _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE -    _PLAYLIST_TYPE = 'bookmarks' - -    _TEST = { -        'url': 'http://www.twitch.tv/ognos/profile/bookmarks', -        'info_dict': { -            'id': 'ognos', -            'title': 'Ognos', -        }, -        
'playlist_mincount': 3, -    } - -    def _extract_playlist_page(self, response): -        entries = [] -        for bookmark in response.get('bookmarks', []): -            video = bookmark.get('video') -            if not video: -                continue -            entries.append(video['url']) -        return entries - -  class TwitchStreamIE(TwitchBaseIE):      IE_NAME = 'twitch:stream'      _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index ea673054f..b73842986 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -5,6 +5,7 @@ import re  from .common import InfoExtractor  from ..utils import ( +    determine_ext,      float_or_none,      xpath_text,      remove_end, @@ -52,7 +53,7 @@ class TwitterCardIE(TwitterBaseIE):                  'id': 'dq4Oj5quskI',                  'ext': 'mp4',                  'title': 'Ubuntu 11.10 Overview', -                'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/', +                'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10...',                  'upload_date': '20111013',                  'uploader': 'OMG! 
Ubuntu!',                  'uploader_id': 'omgubuntu', @@ -116,13 +117,16 @@ class TwitterCardIE(TwitterBaseIE):          video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source')          if video_url: -            f = { -                'url': video_url, -            } +            if determine_ext(video_url) == 'm3u8': +                formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) +            else: +                f = { +                    'url': video_url, +                } -            _search_dimensions_in_video_url(f, video_url) +                _search_dimensions_in_video_url(f, video_url) -            formats.append(f) +                formats.append(f)          vmap_url = config.get('vmapUrl') or config.get('vmap_url')          if vmap_url: @@ -207,6 +211,7 @@ class TwitterIE(InfoExtractor):              'uploader_id': 'giphz',          },          'expected_warnings': ['height', 'width'], +        'skip': 'Account suspended',      }, {          'url': 'https://twitter.com/starwars/status/665052190608723968',          'md5': '39b7199856dee6cd4432e72c74bc69d4', @@ -239,10 +244,10 @@ class TwitterIE(InfoExtractor):          'info_dict': {              'id': '700207533655363584',              'ext': 'mp4', -            'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel', -            'description': 'jay on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', +            'title': 'Donte The Dumbass - BEAT PROD: @suhmeduh #Damndaniel', +            'description': 'Donte The Dumbass on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',              'thumbnail': 're:^https?://.*\.jpg', -            'uploader': 'jay', +            'uploader': 'Donte The Dumbass',              'uploader_id': 'jaydingeer',          },          'params': { @@ -262,7 +267,6 @@ class TwitterIE(InfoExtractor):          'add_ie': 
['Vine'],      }, {          'url': 'https://twitter.com/captainamerica/status/719944021058060289', -        # md5 constantly changes          'info_dict': {              'id': '719944021058060289',              'ext': 'mp4', @@ -271,6 +275,9 @@ class TwitterIE(InfoExtractor):              'uploader_id': 'captainamerica',              'uploader': 'Captain America',          }, +        'params': { +            'skip_download': True,  # requires ffmpeg +        },      }]      def _real_extract(self, url): @@ -278,7 +285,11 @@ class TwitterIE(InfoExtractor):          user_id = mobj.group('user_id')          twid = mobj.group('id') -        webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid) +        webpage, urlh = self._download_webpage_handle( +            self._TEMPLATE_URL % (user_id, twid), twid) + +        if 'twitter.com/account/suspended' in urlh.geturl(): +            raise ExtractorError('Account suspended by Twitter.', expected=True)          username = remove_end(self._og_search_title(webpage), ' on Twitter') diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index d1e6f2703..89b869559 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -5,7 +5,6 @@ import re  from .common import InfoExtractor  from ..compat import (      compat_HTTPError, -    compat_urllib_parse_urlencode,      compat_urllib_request,      compat_urlparse,  ) @@ -84,18 +83,19 @@ class UdemyIE(InfoExtractor):          if enroll_url:              webpage = self._download_webpage(                  combine_url(base_url, enroll_url), -                course_id, 'Enrolling in the course') +                course_id, 'Enrolling in the course', +                headers={'Referer': base_url})              if '>You have enrolled in' in webpage:                  self.to_screen('%s: Successfully enrolled in the course' % course_id)      def _download_lecture(self, course_id, lecture_id):          return 
self._download_json( -            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( -                course_id, lecture_id, compat_urllib_parse_urlencode({ -                    'fields[lecture]': 'title,description,view_html,asset', -                    'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', -                })), -            lecture_id, 'Downloading lecture JSON') +            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?' +            % (course_id, lecture_id), +            lecture_id, 'Downloading lecture JSON', query={ +                'fields[lecture]': 'title,description,view_html,asset', +                'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', +            })      def _handle_error(self, response):          if not isinstance(response, dict): @@ -142,7 +142,9 @@ class UdemyIE(InfoExtractor):              self._LOGIN_URL, None, 'Downloading login popup')          def is_logged(webpage): -            return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<']) +            return any(re.search(p, webpage) for p in ( +                r'href=["\'](?:https://www\.udemy\.com)?/user/logout/', +                r'>Logout<'))          # already logged in          if is_logged(login_popup): @@ -155,13 +157,13 @@ class UdemyIE(InfoExtractor):              'password': password,          }) -        request = sanitized_Request( -            self._LOGIN_URL, urlencode_postdata(login_form)) -        request.add_header('Referer', self._ORIGIN_URL) -        request.add_header('Origin', self._ORIGIN_URL) -          response = self._download_webpage( -            request, None, 'Logging in as %s' % username) +            self._LOGIN_URL, None, 'Logging in as %s' % username, +            data=urlencode_postdata(login_form), +            headers={ +                'Referer': self._ORIGIN_URL, +                'Origin': 
self._ORIGIN_URL, +            })          if not is_logged(response):              error = self._html_search_regex( diff --git a/youtube_dl/extractor/udn.py b/youtube_dl/extractor/udn.py index ee35b7227..57dd73aef 100644 --- a/youtube_dl/extractor/udn.py +++ b/youtube_dl/extractor/udn.py @@ -2,10 +2,13 @@  from __future__ import unicode_literals  import json +import re +  from .common import InfoExtractor  from ..utils import ( +    determine_ext, +    int_or_none,      js_to_json, -    ExtractorError,  )  from ..compat import compat_urlparse @@ -16,13 +19,16 @@ class UDNEmbedIE(InfoExtractor):      _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL      _TESTS = [{          'url': 'http://video.udn.com/embed/news/300040', -        'md5': 'de06b4c90b042c128395a88f0384817e',          'info_dict': {              'id': '300040',              'ext': 'mp4',              'title': '生物老師男變女 全校挺"做自己"',              'thumbnail': 're:^https?://.*\.jpg$', -        } +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }, {          'url': 'https://video.udn.com/embed/news/300040',          'only_matching': True, @@ -38,39 +44,53 @@ class UDNEmbedIE(InfoExtractor):          page = self._download_webpage(url, video_id)          options = json.loads(js_to_json(self._html_search_regex( -            r'var options\s*=\s*([^;]+);', page, 'video urls dictionary'))) +            r'var\s+options\s*=\s*([^;]+);', page, 'video urls dictionary')))          video_urls = options['video']          if video_urls.get('youtube'):              return self.url_result(video_urls.get('youtube'), 'Youtube') -        try: -            del video_urls['youtube'] -        except KeyError: -            pass +        formats = [] +        for video_type, api_url in video_urls.items(): +            if not api_url: +                continue -        formats = [{ -            'url': self._download_webpage( +            video_url = 
self._download_webpage(                  compat_urlparse.urljoin(url, api_url), video_id, -                'retrieve url for %s video' % video_type), -            'format_id': video_type, -            'preference': 0 if video_type == 'mp4' else -1, -        } for video_type, api_url in video_urls.items() if api_url] +                note='retrieve url for %s video' % video_type) -        if not formats: -            raise ExtractorError('No videos found', expected=True) +            ext = determine_ext(video_url) +            if ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    video_url, video_id, ext='mp4', m3u8_id='hls')) +            elif ext == 'f4m': +                formats.extend(self._extract_f4m_formats( +                    video_url, video_id, f4m_id='hds')) +            else: +                mobj = re.search(r'_(?P<height>\d+)p_(?P<tbr>\d+).mp4', video_url) +                a_format = { +                    'url': video_url, +                    # video_type may be 'mp4', which confuses YoutubeDL +                    'format_id': 'http-' + video_type, +                } +                if mobj: +                    a_format.update({ +                        'height': int_or_none(mobj.group('height')), +                        'tbr': int_or_none(mobj.group('tbr')), +                    }) +                formats.append(a_format)          self._sort_formats(formats) -        thumbnail = None - -        if options.get('gallery') and len(options['gallery']): -            thumbnail = options['gallery'][0].get('original') +        thumbnails = [{ +            'url': img_url, +            'id': img_type, +        } for img_type, img_url in options.get('gallery', [{}])[0].items() if img_url]          return {              'id': video_id,              'formats': formats,              'title': options['title'], -            'thumbnail': thumbnail +            'thumbnails': thumbnails,          } diff --git 
a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py index cafc082b6..3484a2046 100644 --- a/youtube_dl/extractor/ustudio.py +++ b/youtube_dl/extractor/ustudio.py @@ -6,10 +6,12 @@ from .common import InfoExtractor  from ..utils import (      int_or_none,      unified_strdate, +    unescapeHTML,  )  class UstudioIE(InfoExtractor): +    IE_NAME = 'ustudio'      _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'      _TEST = {          'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', @@ -27,9 +29,7 @@ class UstudioIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        display_id = mobj.group('display_id') +        video_id, display_id = re.match(self._VALID_URL, url).groups()          config = self._download_xml(              'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id, @@ -37,7 +37,7 @@ class UstudioIE(InfoExtractor):          def extract(kind):              return [{ -                'url': item.attrib['url'], +                'url': unescapeHTML(item.attrib['url']),                  'width': int_or_none(item.get('width')),                  'height': int_or_none(item.get('height')),              } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')] @@ -65,3 +65,61 @@ class UstudioIE(InfoExtractor):              'uploader': uploader,              'formats': formats,          } + + +class UstudioEmbedIE(InfoExtractor): +    IE_NAME = 'ustudio:embed' +    _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)' +    _TEST = { +        'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T', +        'md5': '47c0be52a09b23a7f40de9469cec58f4', +        'info_dict': { +            'id': 'Uw7G1kMCe65T', +            'ext': 'mp4', +            'title': '5 Things IT Should Know About Video', + 
           'description': 'md5:93d32650884b500115e158c5677d25ad', +            'uploader_id': 'DeN7VdYRDKhP', +        } +    } + +    def _real_extract(self, url): +        uploader_id, video_id = re.match(self._VALID_URL, url).groups() +        video_data = self._download_json( +            'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id), +            video_id)['videos'][0] +        title = video_data['name'] + +        formats = [] +        for ext, qualities in video_data.get('transcodes', {}).items(): +            for quality in qualities: +                quality_url = quality.get('url') +                if not quality_url: +                    continue +                height = int_or_none(quality.get('height')) +                formats.append({ +                    'format_id': '%s-%dp' % (ext, height) if height else ext, +                    'url': quality_url, +                    'width': int_or_none(quality.get('width')), +                    'height': height, +                }) +        self._sort_formats(formats) + +        thumbnails = [] +        for image in video_data.get('images', []): +            image_url = image.get('url') +            if not image_url: +                continue +            thumbnails.append({ +                'url': image_url, +            }) + +        return { +            'id': video_id, +            'title': title, +            'description': video_data.get('description'), +            'duration': int_or_none(video_data.get('duration')), +            'uploader_id': uploader_id, +            'tags': video_data.get('keywords'), +            'thumbnails': thumbnails, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index 23ce0a0d1..0f5d68738 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -37,6 +37,7 @@ class VeohIE(InfoExtractor):                  'uploader': 'afp-news',                  
'duration': 123,              }, +            'skip': 'This video has been deleted.',          },          {              'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py index 1a0ff3395..2cd617b91 100644 --- a/youtube_dl/extractor/vessel.py +++ b/youtube_dl/extractor/vessel.py @@ -2,6 +2,7 @@  from __future__ import unicode_literals  import json +import re  from .common import InfoExtractor  from ..utils import ( @@ -12,11 +13,11 @@ from ..utils import (  class VesselIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)' +    _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)'      _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s'      _LOGIN_URL = 'https://www.vessel.com/api/account/login'      _NETRC_MACHINE = 'vessel' -    _TEST = { +    _TESTS = [{          'url': 'https://www.vessel.com/videos/HDN7G5UMs',          'md5': '455cdf8beb71c6dd797fd2f3818d05c4',          'info_dict': { @@ -28,7 +29,16 @@ class VesselIE(InfoExtractor):              'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?',              'timestamp': int,          }, -    } +    }, { +        'url': 'https://www.vessel.com/embed/G4U7gUJ6a?w=615&h=346', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_urls(webpage): +        return [url for _, url in re.findall( +            r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z]+.*?)\1', +            webpage)]      @staticmethod      def make_json_request(url, data): @@ -98,16 +108,24 @@ class VesselIE(InfoExtractor):          formats = []          for f in video_asset.get('sources', []): -            if f['name'] == 'hls-index': +            location = f.get('location') +            if not location: +                continue +            name = 
f.get('name') +            if name == 'hls-index':                  formats.extend(self._extract_m3u8_formats( -                    f['location'], video_id, ext='mp4', m3u8_id='m3u8')) +                    location, video_id, ext='mp4', +                    entry_protocol='m3u8_native', m3u8_id='m3u8', fatal=False)) +            elif name == 'dash-index': +                formats.extend(self._extract_mpd_formats( +                    location, video_id, mpd_id='dash', fatal=False))              else:                  formats.append({ -                    'format_id': f['name'], +                    'format_id': name,                      'tbr': f.get('bitrate'),                      'height': f.get('height'),                      'width': f.get('width'), -                    'url': f['location'], +                    'url': location,                  })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c0ef08c02..388b4debe 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -203,7 +203,8 @@ class VevoIE(VevoBaseIE):          json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id          response = self._download_json( -            json_url, video_id, 'Downloading video info', 'Unable to download info') +            json_url, video_id, 'Downloading video info', +            'Unable to download info', fatal=False) or {}          video_info = response.get('video') or {}          artist = None          featured_artist = None @@ -212,19 +213,17 @@ class VevoIE(VevoBaseIE):          formats = []          if not video_info: -            if response.get('statusCode') != 909: +            try: +                self._initialize_api(video_id) +            except ExtractorError:                  ytid = response.get('errorInfo', {}).get('ytid')                  if ytid:                      self.report_warning(                          'Video is geoblocked, 
trying with the YouTube video %s' % ytid)                      return self.url_result(ytid, 'Youtube', ytid) -                if 'statusMessage' in response: -                    raise ExtractorError('%s said: %s' % ( -                        self.IE_NAME, response['statusMessage']), expected=True) -                raise ExtractorError('Unable to extract videos') +                raise -            self._initialize_api(video_id)              video_info = self._call_api(                  'video/%s' % video_id, video_id, 'Downloading api video info',                  'Failed to download video info') diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 95daf4dfd..e2b2ce098 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -11,12 +11,14 @@ class ViceIE(InfoExtractor):      _TESTS = [{          'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', +        'md5': 'e9d77741f9e42ba583e683cd170660f7',          'info_dict': {              'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',              'ext': 'flv',              'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',              'duration': 725.983,          }, +        'add_ie': ['Ooyala'],      }, {          'url': 'http://www.vice.com/video/how-to-hack-a-car',          'md5': '6fb2989a3fed069fb8eab3401fc2d3c9', @@ -29,6 +31,7 @@ class ViceIE(InfoExtractor):              'uploader': 'Motherboard',              'upload_date': '20140529',          }, +        'add_ie': ['Youtube'],      }, {          'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',          'only_matching': True, diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py new file mode 100644 index 000000000..6898042de --- /dev/null +++ b/youtube_dl/extractor/vidio.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + 
+class VidioIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P<id>\d+)-(?P<display_id>[^/?#&]+)' +    _TESTS = [{ +        'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015', +        'md5': 'cd2801394afc164e9775db6a140b91fe', +        'info_dict': { +            'id': '165683', +            'display_id': 'dj_ambred-booyah-live-2015', +            'ext': 'mp4', +            'title': 'DJ_AMBRED - Booyah (Live 2015)', +            'description': 'md5:27dc15f819b6a78a626490881adbadf8', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 149, +            'like_count': int, +        }, +    }, { +        'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id, display_id = mobj.group('id', 'display_id') + +        webpage = self._download_webpage(url, display_id) + +        title = self._og_search_title(webpage) + +        m3u8_url, duration, thumbnail = [None] * 3 + +        clips = self._parse_json( +            self._html_search_regex( +                r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1', +                webpage, 'video data', default='[]', group='data'), +            display_id, fatal=False) +        if clips: +            clip = clips[0] +            m3u8_url = clip.get('sources', [{}])[0].get('file') +            duration = clip.get('clip_duration') +            thumbnail = clip.get('image') + +        m3u8_url = m3u8_url or self._search_regex( +            r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1', webpage, 'hls url') +        formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + +        duration = int_or_none(duration or self._search_regex( +            r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration')) +     
   thumbnail = thumbnail or self._og_search_thumbnail(webpage) + +        like_count = int_or_none(self._search_regex( +            (r'<span[^>]+data-comment-vote-count=["\'](\d+)', +             r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'), +            webpage, 'like count', fatal=False)) + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': self._og_search_description(webpage), +            'thumbnail': thumbnail, +            'duration': duration, +            'like_count': like_count, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index dd4a13a4a..19500eba8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -141,6 +141,10 @@ class ViewLiftIE(ViewLiftBaseIE):      }, {          'url': 'http://www.kesari.tv/news/video/1461919076414',          'only_matching': True, +    }, { +        # Was once Kaltura embed +        'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 67220f1b7..79c819bc3 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -26,12 +26,16 @@ class VKIE(InfoExtractor):      _VALID_URL = r'''(?x)                      https?://                          (?: -                            (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)| +                            (?: +                                (?:m\.)?vk\.com/video_| +                                (?:www\.)?daxab.com/ +                            ) +                            ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|                              (?:                                  
(?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| -                                (?:www\.)?biqle\.ru/watch/ +                                (?:www\.)?daxab.com/embed/                              ) -                            (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$) +                            (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?                          )                      '''      _NETRC_MACHINE = 'vk' @@ -75,7 +79,8 @@ class VKIE(InfoExtractor):                  'duration': 101,                  'upload_date': '20120730',                  'view_count': int, -            } +            }, +            'skip': 'This video has been removed from public access.',          },          {              # VIDEO NOW REMOVED @@ -142,7 +147,7 @@ class VKIE(InfoExtractor):                  'id': 'V3K4mi0SYkc',                  'ext': 'webm',                  'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", -                'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', +                'description': 'md5:d9903938abdc74c738af77f527ca0596',                  'duration': 178,                  'upload_date': '20130116',                  'uploader': "Children's Joy Foundation", @@ -174,11 +179,6 @@ class VKIE(InfoExtractor):              'only_matching': True,          },          { -            # vk wrapper -            'url': 'http://www.biqle.ru/watch/847655_160197695', -            'only_matching': True, -        }, -        {              # pladform embed              'url': 'https://vk.com/video-76116461_171554880',              'only_matching': True, @@ -217,20 +217,21 @@ class VKIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('videoid') -        if not video_id: +        if video_id: +            info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id +            # Some videos 
(removed?) can only be downloaded with list id specified +            list_id = mobj.group('list_id') +            if list_id: +                info_url += '&list=%s' % list_id +        else: +            info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')              video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) -        info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id - -        # Some videos (removed?) can only be downloaded with list id specified -        list_id = mobj.group('list_id') -        if list_id: -            info_url += '&list=%s' % list_id -          info_page = self._download_webpage(info_url, video_id)          error_message = self._html_search_regex( -            r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', +            [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', +                r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],              info_page, 'error message', default=None)          if error_message:              raise ExtractorError(error_message, expected=True) @@ -305,17 +306,17 @@ class VKIE(InfoExtractor):          view_count = None          views = self._html_search_regex(              r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', -            info_page, 'view count', fatal=False) +            info_page, 'view count', default=None)          if views:              view_count = str_to_int(self._search_regex(                  r'([\d,.]+)', views, 'view count', fatal=False))          formats = []          for k, v in data.items(): -            if not k.startswith('url') and k != 'extra_data' or not v: +            if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v:                  continue              height = int_or_none(self._search_regex( -                r'^url(\d+)', k, 'height', default=None)) +                r'^(?:url|cache)(\d+)', k, 'height', default=None))              
formats.append({                  'format_id': k,                  'url': v, diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index a672ea9c5..8d671cca7 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -1,8 +1,7 @@  # coding: utf-8 -from __future__ import division, unicode_literals +from __future__ import unicode_literals  import re -import time  from .common import InfoExtractor  from ..utils import ( @@ -10,6 +9,7 @@ from ..utils import (      ExtractorError,      float_or_none,      int_or_none, +    remove_start,  )  from ..compat import compat_urllib_parse_urlencode @@ -23,7 +23,7 @@ class VLiveIE(InfoExtractor):          'info_dict': {              'id': '1326',              'ext': 'mp4', -            'title': "[V] Girl's Day's Broadcast", +            'title': "[V LIVE] Girl's Day's Broadcast",              'creator': "Girl's Day",              'view_count': int,          }, @@ -35,24 +35,12 @@ class VLiveIE(InfoExtractor):          webpage = self._download_webpage(              'http://www.vlive.tv/video/%s' % video_id, video_id) -        # UTC+x - UTC+9 (KST) -        tz = time.altzone if time.localtime().tm_isdst == 1 else time.timezone -        tz_offset = -tz // 60 - 9 * 60 -        self._set_cookie('vlive.tv', 'timezoneOffset', '%d' % tz_offset) - -        status_params = self._download_json( -            'http://www.vlive.tv/video/status?videoSeq=%s' % video_id, -            video_id, 'Downloading JSON status', -            headers={'Referer': url.encode('utf-8')}) -        status = status_params.get('status') -        air_start = status_params.get('onAirStartAt', '') -        is_live = status_params.get('isLive') -          video_params = self._search_regex( -            r'vlive\.tv\.video\.ajax\.request\.handler\.init\((.+)\)', +            r'\bvlive\.video\.init\(([^)]+)\)',              webpage, 'video params') -        live_params, long_video_id, key = re.split( -            r'"\s*,\s*"', 
video_params)[1:4] +        status, _, _, live_params, long_video_id, key = re.split( +            r'"\s*,\s*"', video_params)[2:8] +        status = remove_start(status, 'PRODUCT_')          if status == 'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR':              live_params = self._parse_json('"%s"' % live_params, video_id) @@ -61,8 +49,6 @@ class VLiveIE(InfoExtractor):          elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO':              if long_video_id and key:                  return self._replay(video_id, webpage, long_video_id, key) -            elif is_live: -                status = 'LIVE_END'              else:                  status = 'COMING_SOON' @@ -70,7 +56,7 @@ class VLiveIE(InfoExtractor):              raise ExtractorError('Uploading for replay. Please wait...',                                   expected=True)          elif status == 'COMING_SOON': -            raise ExtractorError('Coming soon! %s' % air_start, expected=True) +            raise ExtractorError('Coming soon!', expected=True)          elif status == 'CANCELED':              raise ExtractorError('We are sorry, '                                   'but the live broadcast has been canceled.', diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py index 0c6b1f030..b1b32ad44 100644 --- a/youtube_dl/extractor/voxmedia.py +++ b/youtube_dl/extractor/voxmedia.py @@ -15,7 +15,8 @@ class VoxMediaIE(InfoExtractor):              'ext': 'mp4',              'title': 'Google\'s new material design direction',              'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2', -        } +        }, +        'add_ie': ['Ooyala'],      }, {          # data-ooyala-id          'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', @@ -25,7 +26,8 @@ class VoxMediaIE(InfoExtractor):              'ext': 'mp4',              'title': 'The Nexus 6: hands-on with Google\'s phablet',              'description': 
'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af', -        } +        }, +        'add_ie': ['Ooyala'],      }, {          # volume embed          'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', @@ -35,7 +37,8 @@ class VoxMediaIE(InfoExtractor):              'ext': 'mp4',              'title': 'The new frontier of LGBTQ civil rights, explained',              'description': 'md5:0dc58e94a465cbe91d02950f770eb93f', -        } +        }, +        'add_ie': ['Ooyala'],      }, {          # youtube embed          'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance', @@ -48,7 +51,8 @@ class VoxMediaIE(InfoExtractor):              'upload_date': '20160324',              'uploader_id': 'voxdotcom',              'uploader': 'Vox', -        } +        }, +        'add_ie': ['Youtube'],      }, {          # SBN.VideoLinkset.entryGroup multiple ooyala embeds          'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', @@ -117,7 +121,7 @@ class VoxMediaIE(InfoExtractor):              volume_webpage = self._download_webpage(                  'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid)              video_data = self._parse_json(self._search_regex( -                r'Volume\.createVideo\(({.+})\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid) +                r'Volume\.createVideo\(({.+})\s*,\s*{.*}\s*,\s*\[.*\]\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid)              for provider_video_type in ('ooyala', 'youtube'):                  provider_video_id = video_data.get('%s_id' % provider_video_type)                  if provider_video_id: diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index ec8b99998..839cad986 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -11,7 +11,96 @@ from ..utils import (  class 
WashingtonPostIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])' +    IE_NAME = 'washingtonpost' +    _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' +    _TEST = { +        'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', +        'md5': '6f537e1334b714eb15f9563bd4b9cdfa', +        'info_dict': { +            'id': '480ba4ee-1ec7-11e6-82c2-a7dcb313287d', +            'ext': 'mp4', +            'title': 'Egypt finds belongings, debris from plane crash', +            'description': 'md5:a17ceee432f215a5371388c1f680bd86', +            'upload_date': '20160520', +            'uploader': 'Reuters', +            'timestamp': 1463778452, +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        video_data = self._download_json( +            'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id, +            video_id, transform_source=strip_jsonp)[0]['contentConfig'] +        title = video_data['title'] + +        urls = [] +        formats = [] +        for s in video_data.get('streams', []): +            s_url = s.get('url') +            if not s_url or s_url in urls: +                continue +            urls.append(s_url) +            video_type = s.get('type') +            if video_type == 'smil': +                continue +            elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url): +                m3u8_formats = self._extract_m3u8_formats( +                    s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +                for m3u8_format in m3u8_formats: +                    width = m3u8_format.get('width') +                    if not width: +                        continue +                    vbr = self._search_regex( +     
                   r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None) +                    if vbr: +                        m3u8_format.update({ +                            'vbr': int_or_none(vbr), +                        }) +                formats.extend(m3u8_formats) +            else: +                width = int_or_none(s.get('width')) +                vbr = int_or_none(s.get('bitrate')) +                has_width = width != 0 +                formats.append({ +                    'format_id': ( +                        '%s-%d-%d' % (video_type, width, vbr) +                        if width +                        else video_type), +                    'vbr': vbr if has_width else None, +                    'width': width, +                    'height': int_or_none(s.get('height')), +                    'acodec': s.get('audioCodec'), +                    'vcodec': s.get('videoCodec') if has_width else 'none', +                    'filesize': int_or_none(s.get('fileSize')), +                    'url': s_url, +                    'ext': 'mp4', +                    'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None, +                }) +        source_media_url = video_data.get('sourceMediaURL') +        if source_media_url: +            formats.append({ +                'format_id': 'source_media', +                'url': source_media_url, +            }) +        self._sort_formats( +            formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id')) + +        return { +            'id': video_id, +            'title': title, +            'description': video_data.get('blurb'), +            'uploader': video_data.get('credits', {}).get('source'), +            'formats': formats, +            'duration': int_or_none(video_data.get('videoDuration'), 100), +            'timestamp': int_or_none( +                video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000), +        } + + 
+class WashingtonPostArticleIE(InfoExtractor): +    IE_NAME = 'washingtonpost:article' +    _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'      _TESTS = [{          'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',          'info_dict': { @@ -63,6 +152,10 @@ class WashingtonPostIE(InfoExtractor):          }]      }] +    @classmethod +    def suitable(cls, url): +        return False if WashingtonPostIE.suitable(url) else super(WashingtonPostArticleIE, cls).suitable(url) +      def _real_extract(self, url):          page_id = self._match_id(url)          webpage = self._download_webpage(url, page_id) @@ -74,54 +167,7 @@ class WashingtonPostIE(InfoExtractor):                  <div\s+class="posttv-video-embed[^>]*?data-uuid=|                  data-video-uuid=              )"([^"]+)"''', webpage) -        entries = [] -        for i, uuid in enumerate(uuids, start=1): -            vinfo_all = self._download_json( -                'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid, -                page_id, -                transform_source=strip_jsonp, -                note='Downloading information of video %d/%d' % (i, len(uuids)) -            ) -            vinfo = vinfo_all[0]['contentConfig'] -            uploader = vinfo.get('credits', {}).get('source') -            timestamp = int_or_none( -                vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000) - -            formats = [{ -                'format_id': ( -                    '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate')) -                    if s.get('width') -                    else s.get('type')), -                'vbr': s.get('bitrate') if s.get('width') != 0 else None, -                'width': s.get('width'), -                'height': s.get('height'), -                'acodec': s.get('audioCodec'), -                'vcodec': s.get('videoCodec') if s.get('width') != 0 else 
'none', -                'filesize': s.get('fileSize'), -                'url': s.get('url'), -                'ext': 'mp4', -                'preference': -100 if s.get('type') == 'smil' else None, -                'protocol': { -                    'MP4': 'http', -                    'F4F': 'f4m', -                }.get(s.get('type')), -            } for s in vinfo.get('streams', [])] -            source_media_url = vinfo.get('sourceMediaURL') -            if source_media_url: -                formats.append({ -                    'format_id': 'source_media', -                    'url': source_media_url, -                }) -            self._sort_formats(formats) -            entries.append({ -                'id': uuid, -                'title': vinfo['title'], -                'description': vinfo.get('blurb'), -                'uploader': uploader, -                'formats': formats, -                'duration': int_or_none(vinfo.get('videoDuration'), 100), -                'timestamp': timestamp, -            }) +        entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids]          return {              '_type': 'playlist', diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 5227bb5ad..de7d6b559 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -2,25 +2,26 @@  from __future__ import unicode_literals  import re -import hashlib  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import (      ExtractorError,      unified_strdate, +    HEADRequest, +    float_or_none,  )  class WatIE(InfoExtractor): -    _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|https?://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)' +    _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)'      IE_NAME = 'wat.tv'      _TESTS = [          {              'url': 
'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', -            'md5': 'ce70e9223945ed26a8056d413ca55dc9', +            'md5': '83d882d9de5c9d97f0bb2c6273cde56a',              'info_dict': {                  'id': '11713067', -                'display_id': 'soupe-figues-l-orange-aux-epices',                  'ext': 'mp4',                  'title': 'Soupe de figues à l\'orange et aux épices',                  'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', @@ -33,7 +34,6 @@ class WatIE(InfoExtractor):              'md5': 'fbc84e4378165278e743956d9c1bf16b',              'info_dict': {                  'id': '11713075', -                'display_id': 'gregory-lemarchal-voix-ange',                  'ext': 'mp4',                  'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',                  'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', @@ -44,96 +44,85 @@ class WatIE(InfoExtractor):          },      ] -    def download_video_info(self, real_id): -        # 'contentv4' is used in the website, but it also returns the related -        # videos, we don't need them -        info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id) -        return info['media'] -      def _real_extract(self, url): -        def real_id_for_chapter(chapter): -            return chapter['tc_start'].split('-')[0] -        mobj = re.match(self._VALID_URL, url) -        display_id = mobj.group('display_id') -        real_id = mobj.group('real_id') -        if not real_id: -            short_id = mobj.group('short_id') -            webpage = self._download_webpage(url, display_id or short_id) -            real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id') +        video_id = self._match_id(url) +        video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) -        video_info = 
self.download_video_info(real_id) +        # 'contentv4' is used in the website, but it also returns the related +        # videos, we don't need them +        video_info = self._download_json( +            'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media']          error_desc = video_info.get('error_desc')          if error_desc:              raise ExtractorError(                  '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) -        geo_list = video_info.get('geoList') -        country = geo_list[0] if geo_list else '' -          chapters = video_info['chapters']          first_chapter = chapters[0] -        files = video_info['files'] -        first_file = files[0] -        if real_id_for_chapter(first_chapter) != real_id: -            self.to_screen('Multipart video detected') -            chapter_urls = [] -            for chapter in chapters: -                chapter_id = real_id_for_chapter(chapter) -                # Yes, when we this chapter is processed by WatIE, -                # it will download the info again -                chapter_info = self.download_video_info(chapter_id) -                chapter_urls.append(chapter_info['url']) -            entries = [self.url_result(chapter_url) for chapter_url in chapter_urls] -            return self.playlist_result(entries, real_id, video_info['title']) +        def video_id_for_chapter(chapter): +            return chapter['tc_start'].split('-')[0] -        upload_date = None -        if 'date_diffusion' in first_chapter: -            upload_date = unified_strdate(first_chapter['date_diffusion']) +        if video_id_for_chapter(first_chapter) != video_id: +            self.to_screen('Multipart video detected') +            entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] +            return self.playlist_result(entries, video_id, video_info['title'])          # Otherwise we can continue and extract just one part, we 
have to use -        # the short id for getting the video url - -        formats = [{ -            'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, -            'format_id': 'Mobile', -        }] - -        fmts = [('SD', 'web')] -        if first_file.get('hasHD'): -            fmts.append(('HD', 'webhd')) - -        def compute_token(param): -            timestamp = '%08x' % int(self._download_webpage( -                'http://www.wat.tv/servertime', real_id, -                'Downloading server time').split('|')[0]) -            magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564' -            return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp) - -        for fmt in fmts: -            webid = '/%s/%s' % (fmt[1], real_id) -            video_url = self._download_webpage( -                'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country), -                real_id, -                'Downloading %s video URL' % fmt[0], -                'Failed to download %s video URL' % fmt[0], -                False) -            if not video_url: +        # the video id for getting the video url + +        date_diffusion = first_chapter.get('date_diffusion') +        upload_date = unified_strdate(date_diffusion) if date_diffusion else None + +        def extract_url(path_template, url_type): +            req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) +            head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type) +            red_url = head.geturl() +            if req_url == red_url: +                raise ExtractorError( +                    '%s said: Sorry, this video is not available from your country.' 
% self.IE_NAME, +                    expected=True) +            return red_url + +        m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8') +        http_url = extract_url('android5/%s.mp4', 'http') + +        formats = [] +        m3u8_formats = self._extract_m3u8_formats( +            m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') +        formats.extend(m3u8_formats) +        formats.extend(self._extract_f4m_formats( +            m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'), +            video_id, f4m_id='hds', fatal=False)) +        for m3u8_format in m3u8_formats: +            mobj = re.search( +                r'audio.*?%3D(\d+)(?:-video.*?%3D(\d+))?', m3u8_format['url']) +            if not mobj:                  continue -            formats.append({ -                'url': video_url, -                'ext': 'mp4', -                'format_id': fmt[0], +            abr, vbr = mobj.groups() +            abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) +            m3u8_format.update({ +                'vbr': vbr, +                'abr': abr, +            }) +            if not vbr or not abr: +                continue +            f = m3u8_format.copy() +            f.update({ +                'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url), +                'format_id': f['format_id'].replace('hls', 'http'), +                'protocol': 'http',              }) +            formats.append(f) +        self._sort_formats(formats)          return { -            'id': real_id, -            'display_id': display_id, +            'id': video_id,              'title': first_chapter['title'],              'thumbnail': first_chapter['preview'],              'description': first_chapter['description'],              'view_count': video_info['views'],              'upload_date': upload_date, -            'duration': first_file['duration'], +            'duration': 
video_info['files'][0]['duration'],              'formats': formats,          } diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 31c904303..a9238cbeb 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,214 +1,222 @@  # -*- coding: utf-8 -*-  from __future__ import unicode_literals -import itertools  import re  from .common import InfoExtractor -from ..compat import ( -    compat_parse_qs, -    compat_urlparse, -)  from ..utils import ( +    determine_ext, +    ExtractorError, +    js_to_json, +    strip_jsonp,      unified_strdate, -    qualities, +    update_url_query, +    urlhandle_detect_ext,  )  class WDRIE(InfoExtractor): -    _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?' -    _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX +    _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' +    _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P<type>[^/]+)/(?P<display_id>.+)\.html' +    _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL      _TESTS = [          { -            'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html', +            'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', +            # HDS download, MD5 is unstable              'info_dict': { -                'id': 'mdb-362427', +                'id': 'mdb-1058683',                  'ext': 'flv', -                'title': 'Servicezeit', -                'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb', -                'upload_date': '20140310', -                'is_live': False +                'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100', +                'title': 'Geheimnis Aachener Dom', +                'alt_title': 'Doku am Freitag', +             
   'upload_date': '20160304', +                'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318', +                'is_live': False, +                'subtitles': {'de': [{ +                    'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml' +                }]},              }, -            'params': { -                'skip_download': True, +        }, +        { +            'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', +            'md5': 'f4c1f96d01cf285240f53ea4309663d8', +            'info_dict': { +                'id': 'mdb-1072000', +                'ext': 'mp3', +                'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100', +                'title': 'Schriftstellerin Juli Zeh', +                'alt_title': 'WDR 3 Gespräch am Samstag', +                'upload_date': '20160312', +                'description': 'md5:e127d320bc2b1f149be697ce044a3dd7', +                'is_live': False, +                'subtitles': {}              }, -            'skip': 'Page Not Found',          },          { -            'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html', +            'url': 'http://www1.wdr.de/mediathek/video/live/index.html',              'info_dict': { -                'id': 'mdb-363194', -                'ext': 'flv', -                'title': 'Marga Spiegel ist tot', -                'description': 'md5:2309992a6716c347891c045be50992e4', -                'upload_date': '20140311', -                'is_live': False +                'id': 'mdb-103364', +                'ext': 'mp4', +                'display_id': 'index', +                'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +                'alt_title': 'WDR Fernsehen Live', +                'upload_date': None, +                'description': 
'md5:ae2ff888510623bf8d4b115f95a9b7c9', +                'is_live': True, +                'subtitles': {}              },              'params': { -                'skip_download': True, +                'skip_download': True,  # m3u8 download              }, -            'skip': 'Page Not Found',          },          { -            'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html', -            'md5': '83e9e8fefad36f357278759870805898', +            'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', +            'playlist_mincount': 8,              'info_dict': { -                'id': 'mdb-194332', -                'ext': 'mp3', -                'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)', -                'description': 'md5:2309992a6716c347891c045be50992e4', -                'upload_date': '20091129', -                'is_live': False +                'id': 'aktuelle-stunde/aktuelle-stunde-120',              },          },          { -            'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html', -            'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa', +            'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',              'info_dict': { -                'id': 'mdb-478135', -                'ext': 'mp3', -                'title': 'Flavia Coelho: Amar é Amar', -                'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', -                'upload_date': '20140717', -                'is_live': False +                'id': 'mdb-1096487', +                'ext': 'flv', +                'upload_date': 're:^[0-9]{8}$', +                'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', +                'description': '- Die Sendung mit der Maus -',              }, -            'skip': 'Page Not Found', +            'skip': 'The id changes from week to week because of the new episode'          },  
        { -            'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', -            'playlist_mincount': 146, +            'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5', +            # HDS download, MD5 is unstable              'info_dict': { -                'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100', -            } +                'id': 'mdb-186083', +                'ext': 'flv', +                'upload_date': '20130919', +                'title': 'Sachgeschichte - Achterbahn ', +                'description': '- Die Sendung mit der Maus -', +            },          },          { -            'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', +            'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', +            # Live stream, MD5 unstable              'info_dict': { -                'id': 'mdb-103364', -                'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', -                'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', +                'id': 'mdb-869971',                  'ext': 'flv', -                'upload_date': '20150101', -                'is_live': True -            }, -            'params': { -                'skip_download': True, +                'title': 'Funkhaus Europa Livestream', +                'description': 'md5:2309992a6716c347891c045be50992e4', +                'upload_date': '20160101',              },          }      ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        page_url = mobj.group('url') -        page_id = mobj.group('id') - -        webpage = self._download_webpage(url, page_id) - -        if mobj.group('player') is None: +        url_type = mobj.group('type') +        page_url = mobj.group('page_url') +        display_id = mobj.group('display_id') +        
webpage = self._download_webpage(url, display_id) + +        # for wdr.de the data-extension is in a tag with the class "mediaLink" +        # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" +        # for wdrmaus its in a link to the page in a multiline "videoLink"-tag +        json_metadata = self._html_search_regex( +            r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', +            webpage, 'media link', default=None, flags=re.MULTILINE) + +        if not json_metadata:              entries = [ -                self.url_result(page_url + href, 'WDR') +                self.url_result(page_url + href[0], 'WDR')                  for href in re.findall( -                    r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, +                    r'<a href="(%s)"[^>]+data-extension=' % self._PAGE_REGEX,                      webpage)              ]              if entries:  # Playlist page -                return self.playlist_result(entries, page_id) - -            # Overview page -            entries = [] -            for page_num in itertools.count(2): -                hrefs = re.findall( -                    r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"', -                    webpage) -                entries.extend( -                    self.url_result(page_url + href, 'WDR') -                    for href in hrefs) -                next_url_m = re.search( -                    r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage) -                if not next_url_m: -                    break -                next_url = page_url + next_url_m.group(1) -                webpage = self._download_webpage( -                    next_url, page_id, -                    note='Downloading playlist page %d' % page_num) -            return self.playlist_result(entries, page_id) +                return 
self.playlist_result(entries, playlist_id=display_id) -        flashvars = compat_parse_qs(self._html_search_regex( -            r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars')) +            raise ExtractorError('No downloadable streams found', expected=True) -        page_id = flashvars['trackerClipId'][0] -        video_url = flashvars['dslSrc'][0] -        title = flashvars['trackerClipTitle'][0] -        thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None -        is_live = flashvars.get('isLive', ['0'])[0] == '1' +        media_link_obj = self._parse_json(json_metadata, display_id, +                                          transform_source=js_to_json) +        jsonp_url = media_link_obj['mediaObj']['url'] -        if is_live: -            title = self._live_title(title) - -        if 'trackerClipAirTime' in flashvars: -            upload_date = flashvars['trackerClipAirTime'][0] -        else: -            upload_date = self._html_search_meta( -                'DC.Date', webpage, 'upload date') +        metadata = self._download_json( +            jsonp_url, 'metadata', transform_source=strip_jsonp) -        if upload_date: -            upload_date = unified_strdate(upload_date) +        metadata_tracker_data = metadata['trackerData'] +        metadata_media_resource = metadata['mediaResource']          formats = [] -        preference = qualities(['S', 'M', 'L', 'XL']) -        if video_url.endswith('.f4m'): -            formats.extend(self._extract_f4m_formats( -                video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id, -                f4m_id='hds', fatal=False)) -        elif video_url.endswith('.smil'): -            formats.extend(self._extract_smil_formats( -                video_url, page_id, False, { -                    'hdcore': '3.3.0', -                    'plugin': 'aasp-3.3.0.99.43', -                })) -        else: -            formats.append({ -                'url': 
video_url, -                'http_headers': { -                    'User-Agent': 'mobile', -                }, -            }) +        # check if the metadata contains a direct URL to a file +        for kind, media_resource in metadata_media_resource.items(): +            if kind not in ('dflt', 'alt'): +                continue + +            for tag_name, medium_url in media_resource.items(): +                if tag_name not in ('videoURL', 'audioURL'): +                    continue + +                ext = determine_ext(medium_url) +                if ext == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        medium_url, display_id, 'mp4', 'm3u8_native', +                        m3u8_id='hls')) +                elif ext == 'f4m': +                    manifest_url = update_url_query( +                        medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) +                    formats.extend(self._extract_f4m_formats( +                        manifest_url, display_id, f4m_id='hds', fatal=False)) +                elif ext == 'smil': +                    formats.extend(self._extract_smil_formats( +                        medium_url, 'stream', fatal=False)) +                else: +                    a_format = { +                        'url': medium_url +                    } +                    if ext == 'unknown_video': +                        urlh = self._request_webpage( +                            medium_url, display_id, note='Determining extension') +                        ext = urlhandle_detect_ext(urlh) +                        a_format['ext'] = ext +                    formats.append(a_format) -        m3u8_url = self._search_regex( -            r'rel="adaptiv"[^>]+href="([^"]+)"', -            webpage, 'm3u8 url', default=None) -        if m3u8_url: -            formats.extend(self._extract_m3u8_formats( -                m3u8_url, page_id, 'mp4', 'm3u8_native', -                
m3u8_id='hls', fatal=False)) +        self._sort_formats(formats) -        direct_urls = re.findall( -            r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage) -        if direct_urls: -            for quality, video_url in direct_urls: -                formats.append({ -                    'url': video_url, -                    'preference': preference(quality), -                    'http_headers': { -                        'User-Agent': 'mobile', -                    }, -                }) +        subtitles = {} +        caption_url = metadata_media_resource.get('captionURL') +        if caption_url: +            subtitles['de'] = [{ +                'url': caption_url +            }] -        self._sort_formats(formats) +        title = metadata_tracker_data.get('trackerClipTitle') +        is_live = url_type == 'live' + +        if is_live: +            title = self._live_title(title) +            upload_date = None +        elif 'trackerClipAirTime' in metadata_tracker_data: +            upload_date = metadata_tracker_data['trackerClipAirTime'] +        else: +            upload_date = self._html_search_meta('DC.Date', webpage, 'upload date') -        description = self._html_search_meta('Description', webpage, 'description') +        if upload_date: +            upload_date = unified_strdate(upload_date)          return { -            'id': page_id, -            'formats': formats, +            'id': metadata_tracker_data.get('trackerClipId', display_id), +            'display_id': display_id,              'title': title, -            'description': description, -            'thumbnail': thumbnail, +            'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), +            'formats': formats,              'upload_date': upload_date, -            'is_live': is_live +            'description': self._html_search_meta('Description', webpage), +            'is_live': is_live, +            'subtitles': subtitles,          } @@ -241,81 +249,3 
@@ class WDRMobileIE(InfoExtractor):                  'User-Agent': 'mobile',              },          } - - -class WDRMausIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' -    IE_DESC = 'Sendung mit der Maus' -    _TESTS = [{ -        'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', -        'info_dict': { -            'id': 'aktuelle-sendung', -            'ext': 'mp4', -            'thumbnail': 're:^http://.+\.jpg', -            'upload_date': 're:^[0-9]{8}$', -            'title': 're:^[0-9.]{10} - Aktuelle Sendung$', -        } -    }, { -        'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5', -        'md5': '3b1227ca3ed28d73ec5737c65743b2a3', -        'info_dict': { -            'id': '40_jahre_maus', -            'ext': 'mp4', -            'thumbnail': 're:^http://.+\.jpg', -            'upload_date': '20131007', -            'title': '12.03.2011 - 40 Jahre Maus', -        } -    }] - -    def _real_extract(self, url): -        video_id = self._match_id(url) - -        webpage = self._download_webpage(url, video_id) -        param_code = self._html_search_regex( -            r'<a href="\?startVideo=1&([^"]+)"', webpage, 'parameters') - -        title_date = self._search_regex( -            r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>', -            webpage, 'air date') -        title_str = self._html_search_regex( -            r'<h1>(.*?)</h1>', webpage, 'title') -        title = '%s - %s' % (title_date, title_str) -        upload_date = unified_strdate( -            self._html_search_meta('dc.date', webpage)) - -        fields = compat_parse_qs(param_code) -        video_url = fields['firstVideo'][0] -        thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0]) - -        formats = [{ -            'format_id': 'rtmp', -            'url': video_url, -        }] - -        jscode = 
self._download_webpage( -            'http://www.wdrmaus.de/codebase/js/extended-medien.min.js', -            video_id, fatal=False, -            note='Downloading URL translation table', -            errnote='Could not download URL translation table') -        if jscode: -            for m in re.finditer( -                    r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}", -                    jscode): -                if video_url.startswith(m.group('stream')): -                    http_url = video_url.replace( -                        m.group('stream'), m.group('dl')) -                    formats.append({ -                        'format_id': 'http', -                        'url': http_url, -                    }) -                    break - -        self._sort_formats(formats) - -        return { -            'id': video_id, -            'title': title, -            'formats': formats, -            'thumbnail': thumbnail, -            'upload_date': upload_date, -        } diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 8b14840a2..c634b8dec 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -3,16 +3,17 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..utils import (      ExtractorError, -    sanitized_Request,      int_or_none, +    float_or_none,  )  class WistiaIE(InfoExtractor): -    _VALID_URL = r'https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)' -    _API_URL = 'http://fast.wistia.com/embed/medias/{0:}.json' +    _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.net/embed/iframe/)(?P<id>[a-z0-9]+)' +    _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' +    _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' -    _TEST = { +    _TESTS = [{          'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',          'md5': 'cafeb56ec0c53c18c97405eecb3133df',          'info_dict': { @@ -24,36 +25,54 
@@ class WistiaIE(InfoExtractor):              'timestamp': 1386185018,              'duration': 117,          }, -    } +    }, { +        'url': 'wistia:sh7fpupwlt', +        'only_matching': True, +    }, { +        # with hls video +        'url': 'wistia:807fafadvk', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) -        request = sanitized_Request(self._API_URL.format(video_id)) -        request.add_header('Referer', url)  # Some videos require this. -        data_json = self._download_json(request, video_id) +        data_json = self._download_json( +            self._API_URL % video_id, video_id, +            # Some videos require this. +            headers={ +                'Referer': url if url.startswith('http') else self._IFRAME_URL % video_id, +            }) +          if data_json.get('error'): -            raise ExtractorError('Error while getting the playlist', -                                 expected=True) +            raise ExtractorError( +                'Error while getting the playlist', expected=True) +          data = data_json['media']          title = data['name']          formats = []          thumbnails = []          for a in data['assets']: +            aurl = a.get('url') +            if not aurl: +                continue              astatus = a.get('status')              atype = a.get('type') -            if (astatus is not None and astatus != 2) or atype == 'preview': +            if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'):                  continue              elif atype in ('still', 'still_image'):                  thumbnails.append({ -                    'url': a['url'], -                    'resolution': '%dx%d' % (a['width'], a['height']), +                    'url': aurl, +                    'width': int_or_none(a.get('width')), +                    'height': int_or_none(a.get('height')),                  })         
     else: +                aext = a.get('ext') +                is_m3u8 = a.get('container') == 'm3u8' or aext == 'm3u8'                  formats.append({                      'format_id': atype, -                    'url': a['url'], +                    'url': aurl,                      'tbr': int_or_none(a.get('bitrate')),                      'vbr': int_or_none(a.get('opt_vbitrate')),                      'width': int_or_none(a.get('width')), @@ -61,7 +80,8 @@ class WistiaIE(InfoExtractor):                      'filesize': int_or_none(a.get('size')),                      'vcodec': a.get('codec'),                      'container': a.get('container'), -                    'ext': a.get('ext'), +                    'ext': 'mp4' if is_m3u8 else aext, +                    'protocol': 'm3u8' if is_m3u8 else None,                      'preference': 1 if atype == 'original' else None,                  }) @@ -73,6 +93,6 @@ class WistiaIE(InfoExtractor):              'description': data.get('seoDescription'),              'formats': formats,              'thumbnails': thumbnails, -            'duration': int_or_none(data.get('duration')), +            'duration': float_or_none(data.get('duration')),              'timestamp': int_or_none(data.get('createdAt')),          } diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index b3547174d..bd8e1af2e 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -12,37 +12,52 @@ from ..utils import (  class XHamsterIE(InfoExtractor): -    _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' 
-    _TESTS = [ -        { -            'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', -            'info_dict': { -                'id': '1509445', -                'ext': 'mp4', -                'title': 'FemaleAgent Shy beauty takes the bait', -                'upload_date': '20121014', -                'uploader': 'Ruseful2011', -                'duration': 893.52, -                'age_limit': 18, -            } +    _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?' +    _TESTS = [{ +        'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', +        'md5': '8281348b8d3c53d39fffb377d24eac4e', +        'info_dict': { +            'id': '1509445', +            'ext': 'mp4', +            'title': 'FemaleAgent Shy beauty takes the bait', +            'upload_date': '20121014', +            'uploader': 'Ruseful2011', +            'duration': 893.52, +            'age_limit': 18,          }, -        { -            'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', -            'info_dict': { -                'id': '2221348', -                'ext': 'mp4', -                'title': 'Britney Spears  Sexy Booty', -                'upload_date': '20130914', -                'uploader': 'jojo747400', -                'duration': 200.48, -                'age_limit': 18, -            } +    }, { +        'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', +        'info_dict': { +            'id': '2221348', +            'ext': 'mp4', +            'title': 'Britney Spears  Sexy Booty', +            'upload_date': '20130914', +            'uploader': 'jojo747400', +            'duration': 200.48, +            'age_limit': 18, +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        # empty seo +        'url': 
'http://xhamster.com/movies/5667973/.html', +        'info_dict': { +            'id': '5667973', +            'ext': 'mp4', +            'title': '....', +            'upload_date': '20160208', +            'uploader': 'parejafree', +            'duration': 72.0, +            'age_limit': 18,          }, -        { -            'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', -            'only_matching': True, +        'params': { +            'skip_download': True,          }, -    ] +    }, { +        'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', +        'only_matching': True, +    }]      def _real_extract(self, url):          def extract_video_url(webpage, name): @@ -170,7 +185,7 @@ class XHamsterEmbedIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          video_url = self._search_regex( -            r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, +            r'href="(https?://xhamster\.com/movies/%s/[^"]*\.html[^"]*)"' % video_id,              webpage, 'xhamster url', default=None)          if not video_url: diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 710ad5041..1dfe031ca 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -8,7 +8,6 @@ from ..utils import (      clean_html,      ExtractorError,      determine_ext, -    sanitized_Request,  ) @@ -25,8 +24,6 @@ class XVideosIE(InfoExtractor):          }      } -    _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19' -      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) @@ -35,31 +32,34 @@ class XVideosIE(InfoExtractor):          if mobj:              raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), 
expected=True) -        video_url = compat_urllib_parse_unquote( -            self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))          video_title = self._html_search_regex(              r'<title>(.*?)\s+-\s+XVID', webpage, 'title')          video_thumbnail = self._search_regex(              r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) -        formats = [{ -            'url': video_url, -        }] +        formats = [] -        android_req = sanitized_Request(url) -        android_req.add_header('User-Agent', self._ANDROID_USER_AGENT) -        android_webpage = self._download_webpage(android_req, video_id, fatal=False) +        video_url = compat_urllib_parse_unquote(self._search_regex( +            r'flv_url=(.+?)&', webpage, 'video URL', default='')) +        if video_url: +            formats.append({'url': video_url}) -        if android_webpage is not None: -            player_params_str = self._search_regex( -                'mobileReplacePlayerDivTwoQual\(([^)]+)\)', -                android_webpage, 'player parameters', default='') -            player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(','))) -            if player_params: -                formats.extend([{ -                    'url': param, -                    'preference': -10, -                } for param in player_params if determine_ext(param) == 'mp4']) +        player_args = self._search_regex( +            r'(?s)new\s+HTML5Player\((.+?)\)', webpage, ' html5 player', default=None) +        if player_args: +            for arg in player_args.split(','): +                format_url = self._search_regex( +                    r'(["\'])(?P<url>https?://.+?)\1', arg, 'url', +                    default=None, group='url') +                if not format_url: +                    continue +                ext = determine_ext(format_url) +                if ext == 'mp4': +                    formats.append({'url': format_url}) +                elif 
ext == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        format_url, video_id, 'mp4', +                        entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))          self._sort_formats(formats) @@ -67,7 +67,6 @@ class XVideosIE(InfoExtractor):              'id': video_id,              'formats': formats,              'title': video_title, -            'ext': 'flv',              'thumbnail': video_thumbnail,              'age_limit': 18,          } diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index ce3723b55..b37d0eab6 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -10,8 +10,6 @@ from ..utils import (      ExtractorError,      int_or_none,      float_or_none, -    sanitized_Request, -    urlencode_postdata,  ) @@ -22,18 +20,24 @@ class YandexMusicBaseIE(InfoExtractor):              error = response.get('error')              if error:                  raise ExtractorError(error, expected=True) +            if response.get('type') == 'captcha' or 'captcha' in response: +                YandexMusicBaseIE._raise_captcha() + +    @staticmethod +    def _raise_captcha(): +        raise ExtractorError( +            'YandexMusic has considered youtube-dl requests automated and ' +            'asks you to solve a CAPTCHA. You can either wait for some ' +            'time until unblocked and optionally use --sleep-interval ' +            'in future or alternatively you can go to https://music.yandex.ru/ ' +            'solve CAPTCHA, then export cookies and pass cookie file to ' +            'youtube-dl with --cookies', +            expected=True)      def _download_webpage(self, *args, **kwargs):          webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs)          if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' 
in webpage: -            raise ExtractorError( -                'YandexMusic has considered youtube-dl requests automated and ' -                'asks you to solve a CAPTCHA. You can either wait for some ' -                'time until unblocked and optionally use --sleep-interval ' -                'in future or alternatively you can go to https://music.yandex.ru/ ' -                'solve CAPTCHA, then export cookies and pass cookie file to ' -                'youtube-dl with --cookies', -                expected=True) +            self._raise_captcha()          return webpage      def _download_json(self, *args, **kwargs): @@ -177,7 +181,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):  class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):      IE_NAME = 'yandexmusic:playlist'      IE_DESC = 'Яндекс.Музыка - Плейлист' -    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)' +    _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'      _TESTS = [{          'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', @@ -196,47 +200,64 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):              'id': '1036',              'title': 'Музыка 90-х',          }, -        'playlist_count': 310, +        'playlist_mincount': 300,          'skip': 'Travis CI servers blocked by YandexMusic',      }]      def _real_extract(self, url): -        playlist_id = self._match_id(url) - -        webpage = self._download_webpage(url, playlist_id) - -        mu = self._parse_json( -            self._search_regex( -                r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'), -            playlist_id) - -        playlist = mu['pageData']['playlist'] -        tracks, track_ids = playlist['tracks'], playlist['trackIds'] - -        # tracks dictionary shipped with webpage is limited to 150 tracks, +        mobj = re.match(self._VALID_URL, url) +        tld 
= mobj.group('tld') +        user = mobj.group('user') +        playlist_id = mobj.group('id') + +        playlist = self._download_json( +            'https://music.yandex.%s/handlers/playlist.jsx' % tld, +            playlist_id, 'Downloading missing tracks JSON', +            fatal=False, +            headers={ +                'Referer': url, +                'X-Requested-With': 'XMLHttpRequest', +                'X-Retpath-Y': url, +            }, +            query={ +                'owner': user, +                'kinds': playlist_id, +                'light': 'true', +                'lang': tld, +                'external-domain': 'music.yandex.%s' % tld, +                'overembed': 'false', +            })['playlist'] + +        tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) + +        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,          # missing tracks should be retrieved manually.          if len(tracks) < len(track_ids): -            present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) -            missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) -            request = sanitized_Request( -                'https://music.yandex.ru/handlers/track-entries.jsx', -                urlencode_postdata({ +            present_track_ids = set([ +                compat_str(track['id']) +                for track in tracks if track.get('id')]) +            missing_track_ids = [ +                track_id for track_id in track_ids +                if track_id not in present_track_ids] +            missing_tracks = self._download_json( +                'https://music.yandex.%s/handlers/track-entries.jsx' % tld, +                playlist_id, 'Downloading missing tracks JSON', +                fatal=False, +                headers={ +                    'Referer': url, +                    'X-Requested-With': 'XMLHttpRequest', +              
  }, +                query={                      'entries': ','.join(missing_track_ids), -                    'lang': mu.get('settings', {}).get('lang', 'en'), -                    'external-domain': 'music.yandex.ru', +                    'lang': tld, +                    'external-domain': 'music.yandex.%s' % tld,                      'overembed': 'false', -                    'sign': mu.get('authData', {}).get('user', {}).get('sign'),                      'strict': 'true', -                })) -            request.add_header('Referer', url) -            request.add_header('X-Requested-With', 'XMLHttpRequest') - -            missing_tracks = self._download_json( -                request, playlist_id, 'Downloading missing tracks JSON', fatal=False) +                })              if missing_tracks:                  tracks.extend(missing_tracks)          return self.playlist_result(              self._build_playlist(tracks),              compat_str(playlist_id), -            playlist['title'], playlist.get('description')) +            playlist.get('title'), playlist.get('description')) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 349ce0941..147608ebe 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -2,7 +2,9 @@  from __future__ import unicode_literals  import base64 +import itertools  import random +import re  import string  import time @@ -13,6 +15,7 @@ from ..compat import (  )  from ..utils import (      ExtractorError, +    get_element_by_attribute,      sanitized_Request,  ) @@ -275,6 +278,8 @@ class YoukuIE(InfoExtractor):                      'format_id': self.get_format_name(fm),                      'ext': self.parse_ext_l(fm),                      'filesize': int(seg['size']), +                    'width': stream.get('width'), +                    'height': stream.get('height'),                  })          return { @@ -283,3 +288,52 @@ class YoukuIE(InfoExtractor):              
'title': title,              'entries': entries,          } + + +class YoukuShowIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html' +    IE_NAME = 'youku:show' + +    _TEST = { +        'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html', +        'info_dict': { +            'id': 'zc7c670be07ff11e48b3f', +            'title': '花千骨 未删减版', +            'description': 'md5:578d4f2145ae3f9128d9d4d863312910', +        }, +        'playlist_count': 50, +    } + +    _PAGE_SIZE = 40 + +    def _find_videos_in_page(self, webpage): +        videos = re.findall( +            r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"', webpage) +        return [ +            self.url_result(video_url, YoukuIE.ie_key(), title) +            for video_url, title in videos] + +    def _real_extract(self, url): +        show_id = self._match_id(url) +        webpage = self._download_webpage(url, show_id) + +        entries = self._find_videos_in_page(webpage) + +        playlist_title = self._html_search_regex( +            r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False) +        detail_div = get_element_by_attribute('class', 'detail', webpage) or '' +        playlist_description = self._html_search_regex( +            r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>', +            detail_div, 'playlist description', fatal=False) + +        for idx in itertools.count(1): +            episodes_page = self._download_webpage( +                'http://www.youku.com/show_episode/id_%s.html' % show_id, +                show_id, query={'divid': 'reload_%d' % (idx * self._PAGE_SIZE + 1)}, +                note='Downloading episodes page %d' % idx) +            new_entries = self._find_videos_in_page(episodes_page) +            entries.extend(new_entries) +            if len(new_entries) < self._PAGE_SIZE: +                break + +        return 
self.playlist_result(entries, show_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b7c3cb63f..6c9f77d95 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -344,6 +344,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},          '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},          '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, +        '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, +        '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},          # Dash webm          '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, @@ -1326,9 +1328,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          if video_description:              video_description = re.sub(r'''(?x)                  <a\s+ -                    (?:[a-zA-Z-]+="[^"]+"\s+)*? +                    (?:[a-zA-Z-]+="[^"]*"\s+)*?                      (?:title|href)="([^"]+)"\s+ -                    (?:[a-zA-Z-]+="[^"]+"\s+)*? +                    (?:[a-zA-Z-]+="[^"]*"\s+)*?                      
class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>                  [^<]+\.{3}\s*                  </a> diff --git a/youtube_dl/options.py b/youtube_dl/options.py index d1f8d1331..99ce4131f 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -188,7 +188,10 @@ def parseOpts(overrideArguments=None):      network.add_option(          '--proxy', dest='proxy',          default=None, metavar='URL', -        help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') +        help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable experimental ' +             'SOCKS proxy, specify a proper scheme. For example ' +             'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") ' +             'for direct connection')      network.add_option(          '--socket-timeout',          dest='socket_timeout', type=float, default=None, metavar='SECONDS', @@ -392,8 +395,8 @@ def parseOpts(overrideArguments=None):      downloader = optparse.OptionGroup(parser, 'Download Options')      downloader.add_option( -        '-r', '--rate-limit', -        dest='ratelimit', metavar='LIMIT', +        '-r', '--limit-rate', '--rate-limit', +        dest='ratelimit', metavar='RATE',          help='Maximum download rate in bytes per second (e.g. 
50K or 4.2M)')      downloader.add_option(          '-R', '--retries', @@ -665,7 +668,7 @@ def parseOpts(overrideArguments=None):          action='store_true', dest='writeannotations', default=False,          help='Write video annotations to a .annotations.xml file')      filesystem.add_option( -        '--load-info', +        '--load-info-json', '--load-info',          dest='load_info_filename', metavar='FILE',          help='JSON file containing the video information (created with the "--write-info-json" option)')      filesystem.add_option( diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py index 74f66d669..90630c2d7 100644 --- a/youtube_dl/postprocessor/execafterdownload.py +++ b/youtube_dl/postprocessor/execafterdownload.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals  import subprocess  from .common import PostProcessor -from ..compat import shlex_quote +from ..compat import compat_shlex_quote  from ..utils import PostProcessingError @@ -17,7 +17,7 @@ class ExecAfterDownloadPP(PostProcessor):          if '{}' not in cmd:              cmd += ' {}' -        cmd = cmd.replace('{}', shlex_quote(information['filepath'])) +        cmd = cmd.replace('{}', compat_shlex_quote(information['filepath']))          self._downloader.to_screen('[exec] Executing command: %s' % cmd)          retCode = subprocess.call(cmd, shell=True) diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py new file mode 100644 index 000000000..fd49d7435 --- /dev/null +++ b/youtube_dl/socks.py @@ -0,0 +1,271 @@ +# Public Domain SOCKS proxy protocol implementation +# Adapted from https://gist.github.com/bluec0re/cafd3764412967417fd3 + +from __future__ import unicode_literals + +# References: +# SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol +# SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol +# SOCKS5 protocol https://tools.ietf.org/html/rfc1928 +# SOCKS5 username/password authentication 
https://tools.ietf.org/html/rfc1929 + +import collections +import socket + +from .compat import ( +    compat_ord, +    compat_struct_pack, +    compat_struct_unpack, +) + +__author__ = 'Timo Schmid <coding@timoschmid.de>' + +SOCKS4_VERSION = 4 +SOCKS4_REPLY_VERSION = 0x00 +# Excerpt from SOCKS4A protocol: +# if the client cannot resolve the destination host's domain name to find its +# IP address, it should set the first three bytes of DSTIP to NULL and the last +# byte to a non-zero value. +SOCKS4_DEFAULT_DSTIP = compat_struct_pack('!BBBB', 0, 0, 0, 0xFF) + +SOCKS5_VERSION = 5 +SOCKS5_USER_AUTH_VERSION = 0x01 +SOCKS5_USER_AUTH_SUCCESS = 0x00 + + +class Socks4Command(object): +    CMD_CONNECT = 0x01 +    CMD_BIND = 0x02 + + +class Socks5Command(Socks4Command): +    CMD_UDP_ASSOCIATE = 0x03 + + +class Socks5Auth(object): +    AUTH_NONE = 0x00 +    AUTH_GSSAPI = 0x01 +    AUTH_USER_PASS = 0x02 +    AUTH_NO_ACCEPTABLE = 0xFF  # For server response + + +class Socks5AddressType(object): +    ATYP_IPV4 = 0x01 +    ATYP_DOMAINNAME = 0x03 +    ATYP_IPV6 = 0x04 + + +class ProxyError(IOError): +    ERR_SUCCESS = 0x00 + +    def __init__(self, code=None, msg=None): +        if code is not None and msg is None: +            msg = self.CODES.get(code) and 'unknown error' +        super(ProxyError, self).__init__(code, msg) + + +class InvalidVersionError(ProxyError): +    def __init__(self, expected_version, got_version): +        msg = ('Invalid response version from server. 
Expected {0:02x} got ' +               '{1:02x}'.format(expected_version, got_version)) +        super(InvalidVersionError, self).__init__(0, msg) + + +class Socks4Error(ProxyError): +    ERR_SUCCESS = 90 + +    CODES = { +        91: 'request rejected or failed', +        92: 'request rejected becasue SOCKS server cannot connect to identd on the client', +        93: 'request rejected because the client program and identd report different user-ids' +    } + + +class Socks5Error(ProxyError): +    ERR_GENERAL_FAILURE = 0x01 + +    CODES = { +        0x01: 'general SOCKS server failure', +        0x02: 'connection not allowed by ruleset', +        0x03: 'Network unreachable', +        0x04: 'Host unreachable', +        0x05: 'Connection refused', +        0x06: 'TTL expired', +        0x07: 'Command not supported', +        0x08: 'Address type not supported', +        0xFE: 'unknown username or invalid password', +        0xFF: 'all offered authentication methods were rejected' +    } + + +class ProxyType(object): +    SOCKS4 = 0 +    SOCKS4A = 1 +    SOCKS5 = 2 + +Proxy = collections.namedtuple('Proxy', ( +    'type', 'host', 'port', 'username', 'password', 'remote_dns')) + + +class sockssocket(socket.socket): +    def __init__(self, *args, **kwargs): +        self._proxy = None +        super(sockssocket, self).__init__(*args, **kwargs) + +    def setproxy(self, proxytype, addr, port, rdns=True, username=None, password=None): +        assert proxytype in (ProxyType.SOCKS4, ProxyType.SOCKS4A, ProxyType.SOCKS5) + +        self._proxy = Proxy(proxytype, addr, port, username, password, rdns) + +    def recvall(self, cnt): +        data = b'' +        while len(data) < cnt: +            cur = self.recv(cnt - len(data)) +            if not cur: +                raise IOError('{0} bytes missing'.format(cnt - len(data))) +            data += cur +        return data + +    def _recv_bytes(self, cnt): +        data = self.recvall(cnt) +        return 
compat_struct_unpack('!{0}B'.format(cnt), data) + +    @staticmethod +    def _len_and_data(data): +        return compat_struct_pack('!B', len(data)) + data + +    def _check_response_version(self, expected_version, got_version): +        if got_version != expected_version: +            self.close() +            raise InvalidVersionError(expected_version, got_version) + +    def _resolve_address(self, destaddr, default, use_remote_dns): +        try: +            return socket.inet_aton(destaddr) +        except socket.error: +            if use_remote_dns and self._proxy.remote_dns: +                return default +            else: +                return socket.inet_aton(socket.gethostbyname(destaddr)) + +    def _setup_socks4(self, address, is_4a=False): +        destaddr, port = address + +        ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a) + +        packet = compat_struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr + +        username = (self._proxy.username or '').encode('utf-8') +        packet += username + b'\x00' + +        if is_4a and self._proxy.remote_dns: +            packet += destaddr.encode('utf-8') + b'\x00' + +        self.sendall(packet) + +        version, resp_code, dstport, dsthost = compat_struct_unpack('!BBHI', self.recvall(8)) + +        self._check_response_version(SOCKS4_REPLY_VERSION, version) + +        if resp_code != Socks4Error.ERR_SUCCESS: +            self.close() +            raise Socks4Error(resp_code) + +        return (dsthost, dstport) + +    def _setup_socks4a(self, address): +        self._setup_socks4(address, is_4a=True) + +    def _socks5_auth(self): +        packet = compat_struct_pack('!B', SOCKS5_VERSION) + +        auth_methods = [Socks5Auth.AUTH_NONE] +        if self._proxy.username and self._proxy.password: +            auth_methods.append(Socks5Auth.AUTH_USER_PASS) + +        packet += compat_struct_pack('!B', len(auth_methods)) +        
packet += compat_struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods) + +        self.sendall(packet) + +        version, method = self._recv_bytes(2) + +        self._check_response_version(SOCKS5_VERSION, version) + +        if method == Socks5Auth.AUTH_NO_ACCEPTABLE: +            self.close() +            raise Socks5Error(method) + +        if method == Socks5Auth.AUTH_USER_PASS: +            username = self._proxy.username.encode('utf-8') +            password = self._proxy.password.encode('utf-8') +            packet = compat_struct_pack('!B', SOCKS5_USER_AUTH_VERSION) +            packet += self._len_and_data(username) + self._len_and_data(password) +            self.sendall(packet) + +            version, status = self._recv_bytes(2) + +            self._check_response_version(SOCKS5_USER_AUTH_VERSION, version) + +            if status != SOCKS5_USER_AUTH_SUCCESS: +                self.close() +                raise Socks5Error(Socks5Error.ERR_GENERAL_FAILURE) + +    def _setup_socks5(self, address): +        destaddr, port = address + +        ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True) + +        self._socks5_auth() + +        reserved = 0 +        packet = compat_struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved) +        if ipaddr is None: +            destaddr = destaddr.encode('utf-8') +            packet += compat_struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME) +            packet += self._len_and_data(destaddr) +        else: +            packet += compat_struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr +        packet += compat_struct_pack('!H', port) + +        self.sendall(packet) + +        version, status, reserved, atype = self._recv_bytes(4) + +        self._check_response_version(SOCKS5_VERSION, version) + +        if status != Socks5Error.ERR_SUCCESS: +            self.close() +            raise Socks5Error(status) + +        if atype == Socks5AddressType.ATYP_IPV4: +        
    destaddr = self.recvall(4) +        elif atype == Socks5AddressType.ATYP_DOMAINNAME: +            alen = compat_ord(self.recv(1)) +            destaddr = self.recvall(alen) +        elif atype == Socks5AddressType.ATYP_IPV6: +            destaddr = self.recvall(16) +        destport = compat_struct_unpack('!H', self.recvall(2))[0] + +        return (destaddr, destport) + +    def _make_proxy(self, connect_func, address): +        if not self._proxy: +            return connect_func(self, address) + +        result = connect_func(self, (self._proxy.host, self._proxy.port)) +        if result != 0 and result is not None: +            return result +        setup_funcs = { +            ProxyType.SOCKS4: self._setup_socks4, +            ProxyType.SOCKS4A: self._setup_socks4a, +            ProxyType.SOCKS5: self._setup_socks5, +        } +        setup_funcs[self._proxy.type](address) +        return result + +    def connect(self, address): +        self._make_proxy(socket.socket.connect, address) + +    def connect_ex(self, address): +        return self._make_proxy(socket.socket.connect_ex, address) diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 06c1d6cc1..7cf490aa4 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -4,10 +4,12 @@ import collections  import io  import zlib -from .compat import compat_str +from .compat import ( +    compat_str, +    compat_struct_unpack, +)  from .utils import (      ExtractorError, -    struct_unpack,  ) @@ -23,17 +25,17 @@ def _extract_tags(file_contents):              file_contents[:1])      # Determine number of bits in framesize rectangle -    framesize_nbits = struct_unpack('!B', content[:1])[0] >> 3 +    framesize_nbits = compat_struct_unpack('!B', content[:1])[0] >> 3      framesize_len = (5 + 4 * framesize_nbits + 7) // 8      pos = framesize_len + 2 + 2      while pos < len(content): -        header16 = struct_unpack('<H', content[pos:pos + 2])[0] +        header16 = 
compat_struct_unpack('<H', content[pos:pos + 2])[0]          pos += 2          tag_code = header16 >> 6          tag_len = header16 & 0x3f          if tag_len == 0x3f: -            tag_len = struct_unpack('<I', content[pos:pos + 4])[0] +            tag_len = compat_struct_unpack('<I', content[pos:pos + 4])[0]              pos += 4          assert pos + tag_len <= len(content), \              ('Tag %d ends at %d+%d - that\'s longer than the file (%d)' @@ -101,7 +103,7 @@ def _read_int(reader):      for _ in range(5):          buf = reader.read(1)          assert len(buf) == 1 -        b = struct_unpack('<B', buf)[0] +        b = compat_struct_unpack('<B', buf)[0]          res = res | ((b & 0x7f) << shift)          if b & 0x80 == 0:              break @@ -127,7 +129,7 @@ def _s24(reader):      bs = reader.read(3)      assert len(bs) == 3      last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00' -    return struct_unpack('<i', bs + last_byte)[0] +    return compat_struct_unpack('<i', bs + last_byte)[0]  def _read_string(reader): @@ -146,7 +148,7 @@ def _read_bytes(count, reader):  def _read_byte(reader):      resb = _read_bytes(1, reader=reader) -    res = struct_unpack('<B', resb)[0] +    res = compat_struct_unpack('<B', resb)[0]      return res diff --git a/youtube_dl/update.py b/youtube_dl/update.py index 676ebe1c4..ebce9666a 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -83,11 +83,8 @@ def update_self(to_screen, verbose, opener):      print_notes(to_screen, versions_info['versions']) -    filename = sys.argv[0] -    # Py2EXE: Filename could be different -    if hasattr(sys, 'frozen') and not os.path.isfile(filename): -        if os.path.isfile(filename + '.exe'): -            filename += '.exe' +    # sys.executable is set to the full pathname of the exe-file for py2exe +    filename = sys.executable if hasattr(sys, 'frozen') else sys.argv[0]      if not os.access(filename, os.W_OK):          to_screen('ERROR: no write permissions on %s' % 
filename) @@ -95,7 +92,7 @@ def update_self(to_screen, verbose, opener):      # Py2EXE      if hasattr(sys, 'frozen'): -        exe = os.path.abspath(filename) +        exe = filename          directory = os.path.dirname(exe)          if not os.access(directory, os.W_OK):              to_screen('ERROR: no write permissions on %s' % directory) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7bcc85e2b..229de4b39 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -14,8 +14,8 @@ import email.utils  import errno  import functools  import gzip -import itertools  import io +import itertools  import json  import locale  import math @@ -24,9 +24,8 @@ import os  import pipes  import platform  import re -import ssl  import socket -import struct +import ssl  import subprocess  import sys  import tempfile @@ -43,18 +42,34 @@ from .compat import (      compat_http_client,      compat_kwargs,      compat_parse_qs, +    compat_shlex_quote,      compat_socket_create_connection,      compat_str, +    compat_struct_pack,      compat_urllib_error,      compat_urllib_parse,      compat_urllib_parse_urlencode,      compat_urllib_parse_urlparse, +    compat_urllib_parse_unquote_plus,      compat_urllib_request,      compat_urlparse,      compat_xpath, -    shlex_quote,  ) +from .socks import ( +    ProxyType, +    sockssocket, +) + + +def register_socks_protocols(): +    # "Register" SOCKS protocols +    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 +    # URLs with protocols not in urlparse.uses_netloc are not handled correctly +    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): +        if scheme not in compat_urlparse.uses_netloc: +            compat_urlparse.uses_netloc.append(scheme) +  # This is not clearly defined otherwise  compiled_regex_type = type(re.compile('')) @@ -89,6 +104,11 @@ KNOWN_EXTENSIONS = (      'wav',      'f4f', 'f4m', 'm3u8', 'smil') +# needed for sanitizing filenames in restricted mode 
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', +                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'], +                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy'))) +  def preferredencoding():      """Get preferred encoding. @@ -251,9 +271,9 @@ def get_element_by_attribute(attribute, value, html):      m = re.search(r'''(?xs)          <([a-zA-Z0-9:._-]+) -         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? +         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?           \s+%s=['"]?%s['"]? -         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? +         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?          \s*>          (?P<content>.*?)          </\1> @@ -365,6 +385,8 @@ def sanitize_filename(s, restricted=False, is_id=False):      Set is_id if this is not an arbitrary string, but an ID that should be kept if possible      """      def replace_insane(char): +        if restricted and char in ACCENT_CHARS: +            return ACCENT_CHARS[char]          if char == '?' 
or ord(char) < 32 or ord(char) == 127:              return ''          elif char == '"': @@ -745,8 +767,15 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):          self._params = params      def http_open(self, req): +        conn_class = compat_http_client.HTTPConnection + +        socks_proxy = req.headers.get('Ytdl-socks-proxy') +        if socks_proxy: +            conn_class = make_socks_conn_class(conn_class, socks_proxy) +            del req.headers['Ytdl-socks-proxy'] +          return self.do_open(functools.partial( -            _create_http_connection, self, compat_http_client.HTTPConnection, False), +            _create_http_connection, self, conn_class, False),              req)      @staticmethod @@ -832,9 +861,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):                  # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3                  if sys.version_info >= (3, 0):                      location = location.encode('iso-8859-1').decode('utf-8') +                else: +                    location = location.decode('utf-8')                  location_escaped = escape_url(location)                  if location != location_escaped:                      del resp.headers['Location'] +                    if sys.version_info < (3, 0): +                        location_escaped = location_escaped.encode('utf-8')                      resp.headers['Location'] = location_escaped          return resp @@ -842,6 +875,49 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):      https_response = http_response +def make_socks_conn_class(base_class, socks_proxy): +    assert issubclass(base_class, ( +        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) + +    url_components = compat_urlparse.urlparse(socks_proxy) +    if url_components.scheme.lower() == 'socks5': +        socks_type = ProxyType.SOCKS5 +    elif url_components.scheme.lower() in ('socks', 'socks4'): +        
socks_type = ProxyType.SOCKS4 +    elif url_components.scheme.lower() == 'socks4a': +        socks_type = ProxyType.SOCKS4A + +    def unquote_if_non_empty(s): +        if not s: +            return s +        return compat_urllib_parse_unquote_plus(s) + +    proxy_args = ( +        socks_type, +        url_components.hostname, url_components.port or 1080, +        True,  # Remote DNS +        unquote_if_non_empty(url_components.username), +        unquote_if_non_empty(url_components.password), +    ) + +    class SocksConnection(base_class): +        def connect(self): +            self.sock = sockssocket() +            self.sock.setproxy(*proxy_args) +            if type(self.timeout) in (int, float): +                self.sock.settimeout(self.timeout) +            self.sock.connect((self.host, self.port)) + +            if isinstance(self, compat_http_client.HTTPSConnection): +                if hasattr(self, '_context'):  # Python > 2.6 +                    self.sock = self._context.wrap_socket( +                        self.sock, server_hostname=self.host) +                else: +                    self.sock = ssl.wrap_socket(self.sock) + +    return SocksConnection + +  class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):      def __init__(self, params, https_conn_class=None, *args, **kwargs):          compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs) @@ -850,12 +926,20 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):      def https_open(self, req):          kwargs = {} +        conn_class = self._https_conn_class +          if hasattr(self, '_context'):  # python > 2.6              kwargs['context'] = self._context          if hasattr(self, '_check_hostname'):  # python 3.x              kwargs['check_hostname'] = self._check_hostname + +        socks_proxy = req.headers.get('Ytdl-socks-proxy') +        if socks_proxy: +            conn_class = make_socks_conn_class(conn_class, socks_proxy) +            del 
req.headers['Ytdl-socks-proxy'] +          return self.do_open(functools.partial( -            _create_http_connection, self, self._https_conn_class, True), +            _create_http_connection, self, conn_class, True),              req, **kwargs) @@ -955,6 +1039,7 @@ def unified_strdate(date_str, day_first=True):          format_expressions.extend([              '%d-%m-%Y',              '%d.%m.%Y', +            '%d.%m.%y',              '%d/%m/%Y',              '%d/%m/%y',              '%d/%m/%Y %H:%M:%S', @@ -975,7 +1060,10 @@ def unified_strdate(date_str, day_first=True):      if upload_date is None:          timetuple = email.utils.parsedate_tz(date_str)          if timetuple: -            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') +            try: +                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') +            except ValueError: +                pass      if upload_date is not None:          return compat_str(upload_date) @@ -1186,7 +1274,7 @@ def bytes_to_intlist(bs):  def intlist_to_bytes(xs):      if not xs:          return b'' -    return struct_pack('%dB' % len(xs), *xs) +    return compat_struct_pack('%dB' % len(xs), *xs)  # Cross-platform file locking @@ -1469,15 +1557,11 @@ def setproctitle(title):  def remove_start(s, start): -    if s.startswith(start): -        return s[len(start):] -    return s +    return s[len(start):] if s is not None and s.startswith(start) else s  def remove_end(s, end): -    if s.endswith(end): -        return s[:-len(end)] -    return s +    return s[:-len(end)] if s is not None and s.endswith(end) else s  def remove_quotes(s): @@ -1754,24 +1838,6 @@ def escape_url(url):          fragment=escape_rfc3986(url_parsed.fragment)      ).geturl() -try: -    struct.pack('!I', 0) -except TypeError: -    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument -    # See https://bugs.python.org/issue19099 -    def struct_pack(spec, *args): -        if 
isinstance(spec, compat_str): -            spec = spec.encode('ascii') -        return struct.pack(spec, *args) - -    def struct_unpack(spec, *args): -        if isinstance(spec, compat_str): -            spec = spec.encode('ascii') -        return struct.unpack(spec, *args) -else: -    struct_pack = struct.pack -    struct_unpack = struct.unpack -  def read_batch_urls(batch_fd):      def fixup(url): @@ -1849,7 +1915,7 @@ def parse_age_limit(s):  def strip_jsonp(code):      return re.sub( -        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) +        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)  def js_to_json(code): @@ -1857,24 +1923,38 @@ def js_to_json(code):          v = m.group(0)          if v in ('true', 'false', 'null'):              return v -        if v.startswith('"'): -            v = re.sub(r"\\'", "'", v[1:-1]) -        elif v.startswith("'"): -            v = v[1:-1] -            v = re.sub(r"\\\\|\\'|\"", lambda m: { -                '\\\\': '\\\\', -                "\\'": "'", +        elif v.startswith('/*') or v == ',': +            return "" + +        if v[0] in ("'", '"'): +            v = re.sub(r'(?s)\\.|"', lambda m: {                  '"': '\\"', -            }[m.group(0)], v) +                "\\'": "'", +                '\\\n': '', +                '\\x': '\\u00', +            }.get(m.group(0), m.group(0)), v[1:-1]) + +        INTEGER_TABLE = ( +            (r'^0[xX][0-9a-fA-F]+', 16), +            (r'^0+[0-7]+', 8), +        ) + +        for regex, base in INTEGER_TABLE: +            im = re.match(regex, v) +            if im: +                i = int(im.group(0), base) +                return '"%d":' % i if v.endswith(':') else '%d' % i +          return '"%s"' % v -    res = re.sub(r'''(?x) -        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"| -        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'| -        [a-zA-Z_][.a-zA-Z_0-9]* +    return re.sub(r'''(?sx) +        
"(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| +        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| +        /\*.*?\*/|,(?=\s*[\]}])| +        [a-zA-Z_][.a-zA-Z_0-9]*| +        (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| +        [0-9]+(?=\s*:)          ''', fix_kv, code) -    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res) -    return res  def qualities(quality_ids): @@ -1922,7 +2002,7 @@ def ytdl_is_updateable():  def args_to_str(args):      # Get a short string representation for a subprocess command -    return ' '.join(shlex_quote(a) for a in args) +    return ' '.join(compat_shlex_quote(a) for a in args)  def error_to_compat_str(err): @@ -1940,6 +2020,9 @@ def mimetype2ext(mt):      ext = {          'audio/mp4': 'm4a', +        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as +        # it's the most popular one +        'audio/mpeg': 'mp3',      }.get(mt)      if ext is not None:          return ext @@ -1960,11 +2043,7 @@ def mimetype2ext(mt):  def urlhandle_detect_ext(url_handle): -    try: -        url_handle.headers -        getheader = lambda h: url_handle.headers[h] -    except AttributeError:  # Python < 3 -        getheader = url_handle.info().getheader +    getheader = url_handle.headers.get      cd = getheader('Content-Disposition')      if cd: @@ -2694,6 +2773,10 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):          if proxy == '__noproxy__':              return None  # No Proxy +        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): +            req.add_header('Ytdl-socks-proxy', proxy) +            # youtube-dl's http/https handlers do wrapping the socket with socks +            return None          return compat_urllib_request.ProxyHandler.proxy_open(              self, req, proxy, type) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 551160897..d24d06f4a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 
+1,3 @@  from __future__ import unicode_literals -__version__ = '2016.05.01' +__version__ = '2016.06.03'  | 
