Diffstat (limited to 'youtube_dl')
211 files changed, 8270 insertions, 3801 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 033b50702..bfb4ff225 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -26,6 +26,8 @@ import tokenize
 import traceback
 import random

+from string import ascii_letters
+
 from .compat import (
     compat_basestring,
     compat_cookiejar,
@@ -58,6 +60,7 @@ from .utils import (
     format_bytes,
     formatSeconds,
     GeoRestrictedError,
+    int_or_none,
     ISO3166Utils,
     locked_file,
     make_HTTPS_handler,
@@ -303,6 +306,17 @@ class YoutubeDL(object):
                         postprocessor.
     """

+    _NUMERIC_FIELDS = set((
+        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
+        'timestamp', 'upload_year', 'upload_month', 'upload_day',
+        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
+        'average_rating', 'comment_count', 'age_limit',
+        'start_time', 'end_time',
+        'chapter_number', 'season_number', 'episode_number',
+        'track_number', 'disc_number', 'release_year',
+        'playlist_index',
+    ))
+
     params = None
     _ies = []
     _pps = []
@@ -371,10 +385,10 @@ class YoutubeDL(object):
                 else:
                     raise

-        if (sys.version_info >= (3,) and sys.platform != 'win32' and
+        if (sys.platform != 'win32' and
                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                 not params.get('restrictfilenames', False)):
-            # On Python 3, the Unicode filesystem API will throw errors (#1474)
+            # Unicode filesystem API will throw errors (#1474, #13027)
             self.report_warning(
                 'Assuming --restrict-filenames since file system encoding '
                 'cannot encode all characters. '
@@ -499,24 +513,25 @@ class YoutubeDL(object):
     def to_console_title(self, message):
         if not self.params.get('consoletitle', False):
             return
-        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
-            # c_wchar_p() might not be necessary if `message` is
-            # already of type unicode()
-            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
+        if compat_os_name == 'nt':
+            if ctypes.windll.kernel32.GetConsoleWindow():
+                # c_wchar_p() might not be necessary if `message` is
+                # already of type unicode()
+                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
         elif 'TERM' in os.environ:
             self._write_string('\033]0;%s\007' % message, self._screen_file)

     def save_console_title(self):
         if not self.params.get('consoletitle', False):
             return
-        if 'TERM' in os.environ:
+        if compat_os_name != 'nt' and 'TERM' in os.environ:
             # Save the title on stack
             self._write_string('\033[22;0t', self._screen_file)

     def restore_console_title(self):
         if not self.params.get('consoletitle', False):
             return
-        if 'TERM' in os.environ:
+        if compat_os_name != 'nt' and 'TERM' in os.environ:
             # Restore the title from stack
             self._write_string('\033[23;0t', self._screen_file)
@@ -639,22 +654,11 @@ class YoutubeDL(object):
                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
                     outtmpl)

-            NUMERIC_FIELDS = set((
-                'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
-                'timestamp', 'upload_year', 'upload_month', 'upload_day',
-                'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
-                'average_rating', 'comment_count', 'age_limit',
-                'start_time', 'end_time',
-                'chapter_number', 'season_number', 'episode_number',
-                'track_number', 'disc_number', 'release_year',
-                'playlist_index',
-            ))
-
             # Missing numeric fields used together with integer presentation types
             # in format specification will break the argument substitution since
             # string 'NA' is returned for missing fields. We will patch output
             # template for missing fields to meet string presentation type.
-            for numeric_field in NUMERIC_FIELDS:
+            for numeric_field in self._NUMERIC_FIELDS:
                 if numeric_field not in template_dict:
                     # As of [1] format syntax is:
                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
@@ -673,7 +677,19 @@ class YoutubeDL(object):
                         FORMAT_RE.format(numeric_field),
                         r'%({0})s'.format(numeric_field), outtmpl)

-            filename = expand_path(outtmpl % template_dict)
+            # expand_path translates '%%' into '%' and '$$' into '$',
+            # which is not what we want, since we need to keep
+            # '%%' intact for the template dict substitution step.
+            # Working around this with a boundary-like separator hack.
+            sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
+            outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
+
+            # outtmpl should be expand_path'ed before template dict substitution
+            # because meta fields may contain env variables we don't want to
+            # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
+            # title "Hello $PATH", we don't want `$PATH` to be expanded.
+            filename = expand_path(outtmpl).replace(sep, '') % template_dict
+
             # Temporary fix for #4787
             # 'Treat' all problem characters by passing filename through preferredencoding
             # to workaround encoding issues with subprocess on python2 @ Windows
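The hunk above has to reconcile two substitution passes that fight over the same characters: environment expansion must run first (so a '$PATH' inside a metadata value is never expanded), yet the literal '%%' and '$$' escapes must survive that pass for the later '%' substitution. A minimal standalone sketch of the trick, with expand_path approximated by os.path.expandvars and expand_outtmpl a hypothetical helper:

    import os
    import random
    from string import ascii_letters

    def expand_outtmpl(outtmpl, template_dict):
        # Hide literal '%%'/'$$' behind a random separator so environment
        # expansion cannot touch them, then restore and apply the template
        sep = ''.join(random.choice(ascii_letters) for _ in range(32))
        tmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
        return os.path.expandvars(tmpl).replace(sep, '') % template_dict

    print(expand_outtmpl('%(title)s-100%%.%(ext)s',
                         {'title': 'Hello $PATH', 'ext': 'mp4'}))
    # -> Hello $PATH-100%.mp4 ('$PATH' survives because the title is
    #    substituted only after the environment pass)

The 32 random letters make an accidental collision with template text vanishingly unlikely, which is all the "boundary" needs to be.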
@@ -845,7 +861,7 @@ class YoutubeDL(object):
             force_properties = dict(
                 (k, v) for k, v in ie_result.items() if v is not None)
-            for f in ('_type', 'url', 'ie_key'):
+            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                 if f in force_properties:
                     del force_properties[f]
             new_result = info.copy()
@@ -1049,6 +1065,25 @@ class YoutubeDL(object):
             return op(actual_value, comparison_value)
         return _filter

+    def _default_format_spec(self, info_dict, download=True):
+        req_format_list = []
+
+        def can_have_partial_formats():
+            if self.params.get('simulate', False):
+                return True
+            if not download:
+                return True
+            if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
+                return False
+            if info_dict.get('is_live'):
+                return False
+            merger = FFmpegMergerPP(self)
+            return merger.available and merger.can_merge()
+        if can_have_partial_formats():
+            req_format_list.append('bestvideo+bestaudio')
+        req_format_list.append('best')
+        return '/'.join(req_format_list)
+
     def build_format_selector(self, format_spec):
         def syntax_error(note, start):
             message = (
@@ -1345,9 +1380,28 @@ class YoutubeDL(object):
         if 'title' not in info_dict:
             raise ExtractorError('Missing "title" field in extractor result')

-        if not isinstance(info_dict['id'], compat_str):
-            self.report_warning('"id" field is not a string - forcing string conversion')
-            info_dict['id'] = compat_str(info_dict['id'])
+        def report_force_conversion(field, field_not, conversion):
+            self.report_warning(
+                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
+                % (field, field_not, conversion))
+
+        def sanitize_string_field(info, string_field):
+            field = info.get(string_field)
+            if field is None or isinstance(field, compat_str):
+                return
+            report_force_conversion(string_field, 'a string', 'string')
+            info[string_field] = compat_str(field)
+
+        def sanitize_numeric_fields(info):
+            for numeric_field in self._NUMERIC_FIELDS:
+                field = info.get(numeric_field)
+                if field is None or isinstance(field, compat_numeric_types):
+                    continue
+                report_force_conversion(numeric_field, 'numeric', 'int')
+                info[numeric_field] = int_or_none(field)
+
+        sanitize_string_field(info_dict, 'id')
+        sanitize_numeric_fields(info_dict)

         if 'playlist' not in info_dict:
             # It isn't part of a playlist
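sanitize_numeric_fields leans on int_or_none from youtube_dl.utils, which coerces to int and swallows failures instead of raising. A simplified sketch of the behaviour (the isinstance check stands in for compat_numeric_types):

    def int_or_none(v, default=None):
        # Simplified youtube_dl.utils.int_or_none: coerce, never raise
        try:
            return int(v)
        except (TypeError, ValueError):
            return default

    info = {'id': 42, 'view_count': '1337', 'duration': 'n/a'}
    for field in ('view_count', 'duration'):
        if not isinstance(info.get(field), (int, float)):
            info[field] = int_or_none(info[field])
    print(info)  # {'id': 42, 'view_count': 1337, 'duration': None}

Uncoercible values become None rather than crashing the download, which matches the "there is an error in extractor" warning policy above.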
@@ -1428,16 +1482,28 @@ class YoutubeDL(object):
         if not formats:
             raise ExtractorError('No video formats found!')

+        def is_wellformed(f):
+            url = f.get('url')
+            if not url:
+                self.report_warning(
+                    '"url" field is missing or empty - skipping format, '
+                    'there is an error in extractor')
+                return False
+            if isinstance(url, bytes):
+                sanitize_string_field(f, 'url')
+            return True
+
+        # Filter out malformed formats for better extraction robustness
+        formats = list(filter(is_wellformed, formats))
+
         formats_dict = {}

         # We check that all the formats have the format and format_id fields
         for i, format in enumerate(formats):
-            if 'url' not in format:
-                raise ExtractorError('Missing "url" key in result (index %d)' % i)
-
+            sanitize_string_field(format, 'format_id')
+            sanitize_numeric_fields(format)
             format['url'] = sanitize_url(format['url'])
-
-            if format.get('format_id') is None:
+            if not format.get('format_id'):
                 format['format_id'] = compat_str(i)
             else:
                 # Sanitize format_id from characters used in format selector expression
@@ -1490,14 +1556,10 @@ class YoutubeDL(object):
         req_format = self.params.get('format')
         if req_format is None:
-            req_format_list = []
-            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
-                    not info_dict.get('is_live')):
-                merger = FFmpegMergerPP(self)
-                if merger.available and merger.can_merge():
-                    req_format_list.append('bestvideo+bestaudio')
-            req_format_list.append('best')
-            req_format = '/'.join(req_format_list)
+            req_format = self._default_format_spec(info_dict, download=download)
+            if self.params.get('verbose'):
+                self.to_stdout('[debug] Default format spec: %s' % req_format)
+
         format_selector = self.build_format_selector(req_format)

         # While in format selection we may need to have access to the original
@@ -1649,12 +1711,17 @@ class YoutubeDL(object):
         if filename is None:
             return

-        try:
-            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
-            if dn and not os.path.exists(dn):
-                os.makedirs(dn)
-        except (OSError, IOError) as err:
-            self.report_error('unable to create directory ' + error_to_compat_str(err))
+        def ensure_dir_exists(path):
+            try:
+                dn = os.path.dirname(path)
+                if dn and not os.path.exists(dn):
+                    os.makedirs(dn)
+                return True
+            except (OSError, IOError) as err:
+                self.report_error('unable to create directory ' + error_to_compat_str(err))
+                return False
+
+        if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
             return

         if self.params.get('writedescription', False):
@@ -1697,29 +1764,30 @@ class YoutubeDL(object):
             ie = self.get_info_extractor(info_dict['extractor_key'])
             for sub_lang, sub_info in subtitles.items():
                 sub_format = sub_info['ext']
-                if sub_info.get('data') is not None:
-                    sub_data = sub_info['data']
+                sub_filename = subtitles_filename(filename, sub_lang, sub_format)
+                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
+                    self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
                 else:
-                    try:
-                        sub_data = ie._download_webpage(
-                            sub_info['url'], info_dict['id'], note=False)
-                    except ExtractorError as err:
-                        self.report_warning('Unable to download subtitle for "%s": %s' %
-                                            (sub_lang, error_to_compat_str(err.cause)))
-                        continue
-                try:
-                    sub_filename = subtitles_filename(filename, sub_lang, sub_format)
-                    if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
-                        self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
+                    self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
+                    if sub_info.get('data') is not None:
+                        try:
+                            # Use newline='' to prevent conversion of newline characters
+                            # See https://github.com/rg3/youtube-dl/issues/10268
+                            with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
+                                subfile.write(sub_info['data'])
+                        except (OSError, IOError):
+                            self.report_error('Cannot write subtitles file ' + sub_filename)
+                            return
                     else:
-                        self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
-                        # Use newline='' to prevent conversion of newline characters
-                        # See https://github.com/rg3/youtube-dl/issues/10268
-                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
-                            subfile.write(sub_data)
-                except (OSError, IOError):
-                    self.report_error('Cannot write subtitles file ' + sub_filename)
-                    return
+                        try:
+                            sub_data = ie._request_webpage(
+                                sub_info['url'], info_dict['id'], note=False).read()
+                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
+                                subfile.write(sub_data)
+                        except (ExtractorError, IOError, OSError, ValueError) as err:
+                            self.report_warning('Unable to download subtitle for "%s": %s' %
+                                                (sub_lang, error_to_compat_str(err)))
+                            continue

         if self.params.get('writeinfojson', False):
             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
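The newline='' argument above matters: in text mode, io.open would otherwise translate every '\n' to os.linesep on write, corrupting subtitles whose cues rely on exact line endings. A tiny demonstration (the file name is made up):

    import io

    vtt = u'WEBVTT\r\n\r\n00:00.000 --> 00:01.000\r\nHello\r\n'
    # newline='' disables newline translation on write, so the subtitle's
    # original '\r\n' endings reach the disk unchanged
    with io.open('sub.en.vtt', 'w', encoding='utf-8', newline='') as f:
        f.write(vtt)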
@@ -1792,8 +1860,11 @@ class YoutubeDL(object):
                         for f in requested_formats:
                             new_info = dict(info_dict)
                             new_info.update(f)
-                            fname = self.prepare_filename(new_info)
-                            fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
+                            fname = prepend_extension(
+                                self.prepare_filename(new_info),
+                                'f%s' % f['format_id'], new_info['ext'])
+                            if not ensure_dir_exists(fname):
+                                return
                             downloaded.append(fname)
                             partial_success = dl(fname, new_info)
                             success = success and partial_success
@@ -1860,7 +1931,7 @@ class YoutubeDL(object):
                         info_dict.get('protocol') == 'm3u8' and
                         self.params.get('hls_prefer_native')):
                     if fixup_policy == 'warn':
-                        self.report_warning('%s: malformated aac bitstream.' % (
+                        self.report_warning('%s: malformed AAC bitstream detected.' % (
                             info_dict['id']))
                     elif fixup_policy == 'detect_or_warn':
                         fixup_pp = FFmpegFixupM3u8PP(self)
@@ -1869,7 +1940,7 @@ class YoutubeDL(object):
                             info_dict['__postprocessors'].append(fixup_pp)
                         else:
                             self.report_warning(
-                                '%s: malformated aac bitstream. %s'
+                                '%s: malformed AAC bitstream detected. %s'
                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                     else:
                         assert fixup_policy in ('ignore', 'never')
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 39527117f..9e4e13bcf 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -2322,6 +2322,19 @@ try:
 except ImportError:  # Python 2
     from HTMLParser import HTMLParser as compat_HTMLParser

+try:  # Python 2
+    from HTMLParser import HTMLParseError as compat_HTMLParseError
+except ImportError:  # Python 3
+    try:
+        from html.parser import HTMLParseError as compat_HTMLParseError
+    except ImportError:  # Python >= 3.5
+
+        # HTMLParseError has been deprecated in Python 3.3 and removed in
+        # Python 3.5. Introducing a dummy exception for Python >= 3.5 for
+        # compatible and uniform cross-version exception handling
+        class compat_HTMLParseError(Exception):
+            pass
+
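The compat_HTMLParseError fallback just above follows the usual two-level import pattern: try the Python 2 location, then the Python 3 location, then synthesize a stub where the class was removed. Extracted as a stand-alone sketch, so call sites can catch the alias unconditionally on any interpreter:

    try:  # Python 2
        from HTMLParser import HTMLParseError as compat_HTMLParseError
    except ImportError:  # Python 3
        try:  # 3.3/3.4: still importable, already deprecated
            from html.parser import HTMLParseError as compat_HTMLParseError
        except ImportError:  # 3.5+: gone, so define a stub
            class compat_HTMLParseError(Exception):
                pass

    try:
        raise compat_HTMLParseError('malformed document')
    except compat_HTMLParseError as e:
        print(e)  # behaves identically on 2.7, 3.4 and 3.6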
 try:
     from subprocess import DEVNULL
     compat_subprocess_get_DEVNULL = lambda: DEVNULL
@@ -2604,14 +2617,22 @@ except ImportError:  # Python 2
                 parsed_result[name] = [value]
         return parsed_result

-try:
-    from shlex import quote as compat_shlex_quote
-except ImportError:  # Python < 3.3
+
+compat_os_name = os._name if os.name == 'java' else os.name
+
+
+if compat_os_name == 'nt':
     def compat_shlex_quote(s):
-        if re.match(r'^[-_\w./]+$', s):
-            return s
-        else:
-            return "'" + s.replace("'", "'\"'\"'") + "'"
+        return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"')
+else:
+    try:
+        from shlex import quote as compat_shlex_quote
+    except ImportError:  # Python < 3.3
+        def compat_shlex_quote(s):
+            if re.match(r'^[-_\w./]+$', s):
+                return s
+            else:
+                return "'" + s.replace("'", "'\"'\"'") + "'"

 try:
@@ -2636,9 +2657,6 @@ def compat_ord(c):
         return ord(c)

-compat_os_name = os._name if os.name == 'java' else os.name
-
-
 if sys.version_info >= (3, 0):
     compat_getenv = os.getenv
     compat_expanduser = os.path.expanduser
@@ -2882,6 +2900,7 @@ else:

 __all__ = [
+    'compat_HTMLParseError',
     'compat_HTMLParser',
     'compat_HTTPError',
     'compat_basestring',
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 5d6621147..75b8166c5 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -8,10 +8,11 @@ import random
 from ..compat import compat_os_name
 from ..utils import (
+    decodeArgument,
     encodeFilename,
     error_to_compat_str,
-    decodeArgument,
     format_bytes,
+    shell_quote,
     timeconvert,
 )
@@ -303,11 +304,11 @@ class FileDownloader(object):
         """Report attempt to resume at given byte."""
         self.to_screen('[download] Resuming download at byte %s' % resume_len)

-    def report_retry(self, count, retries):
+    def report_retry(self, err, count, retries):
         """Report retry in case of HTTP error 5xx"""
         self.to_screen(
-            '[download] Got server HTTP error. Retrying (attempt %d of %s)...'
-            % (count, self.format_retries(retries)))
+            '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...'
+            % (error_to_compat_str(err), count, self.format_retries(retries)))

     def report_file_already_downloaded(self, file_name):
         """Report file has already been fully downloaded."""
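The compat.py change above forks compat_shlex_quote by platform because POSIX single-quote escaping means nothing to cmd.exe. A side-by-side sketch of the two branches (simplified copies of the code above, renamed for the comparison):

    import re

    def shlex_quote_posix(s):
        return s if re.match(r'^[-_\w./]+$', s) else "'" + s.replace("'", "'\"'\"'") + "'"

    def shlex_quote_nt(s):
        return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"')

    arg = 'Rick & Morty "Pilot".mp4'
    print(shlex_quote_posix(arg))  # 'Rick & Morty "Pilot".mp4'   (POSIX shells)
    print(shlex_quote_nt(arg))     # "Rick & Morty \"Pilot\".mp4" (cmd.exe)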
@@ -381,10 +382,5 @@ class FileDownloader(object):
         if exe is None:
             exe = os.path.basename(str_args[0])

-        try:
-            import pipes
-            shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
-        except ImportError:
-            shell_quote = repr
         self.to_screen('[debug] %s command line: %s' % (
             exe, shell_quote(str_args)))
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
index 7491fdad8..576ece6db 100644
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals

 from .fragment import FragmentFD
 from ..compat import compat_urllib_error
+from ..utils import urljoin

 class DashSegmentsFD(FragmentFD):
@@ -12,12 +13,13 @@ class DashSegmentsFD(FragmentFD):
     FD_NAME = 'dashsegments'

     def real_download(self, filename, info_dict):
-        segments = info_dict['fragments'][:1] if self.params.get(
+        fragment_base_url = info_dict.get('fragment_base_url')
+        fragments = info_dict['fragments'][:1] if self.params.get(
             'test', False) else info_dict['fragments']

         ctx = {
             'filename': filename,
-            'total_frags': len(segments),
+            'total_frags': len(fragments),
         }

         self._prepare_and_start_frag_download(ctx)
@@ -26,7 +28,7 @@ class DashSegmentsFD(FragmentFD):
         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)

         frag_index = 0
-        for i, segment in enumerate(segments):
+        for i, fragment in enumerate(fragments):
             frag_index += 1
             if frag_index <= ctx['fragment_index']:
                 continue
@@ -36,7 +38,11 @@ class DashSegmentsFD(FragmentFD):
             count = 0
             while count <= fragment_retries:
                 try:
-                    success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
+                    fragment_url = fragment.get('url')
+                    if not fragment_url:
+                        assert fragment_base_url
+                        fragment_url = urljoin(fragment_base_url, fragment['path'])
+                    success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
                     if not success:
                         return False
                     self._append_fragment(ctx, frag_content)
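With the dash.py change above, a fragment may carry either an absolute 'url' or a relative 'path' resolved against the manifest-level 'fragment_base_url'. A sketch of the resolution, with the standard library's urljoin standing in for youtube_dl.utils.urljoin and hypothetical fragment data:

    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    fragment_base_url = 'https://cdn.example.com/video/dash/'  # hypothetical
    fragments = [
        {'url': 'https://cdn.example.com/video/dash/init.mp4'},
        {'path': 'segment-1.m4s'},
        {'path': 'segment-2.m4s'},
    ]

    for fragment in fragments:
        fragment_url = fragment.get('url')
        if not fragment_url:
            # Relative fragments are resolved against the manifest-level base
            fragment_url = urljoin(fragment_base_url, fragment['path'])
        print(fragment_url)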
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
index e78169a0d..db018fa89 100644
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@@ -212,6 +212,11 @@ class FFmpegFD(ExternalFD):

         args = [ffpp.executable, '-y']

+        for log_level in ('quiet', 'verbose'):
+            if self.params.get(log_level, False):
+                args += ['-loglevel', log_level]
+                break
+
         seekable = info_dict.get('_seekable')
         if seekable is not None:
             # setting -seekable prevents ffmpeg from guessing if the server
diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
index bccc8ecc1..6f6fb4a77 100644
--- a/youtube_dl/downloader/fragment.py
+++ b/youtube_dl/downloader/fragment.py
@@ -151,10 +151,15 @@ class FragmentFD(FileDownloader):
         if self.__do_ytdl_file(ctx):
             if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))):
                 self._read_ytdl_file(ctx)
+                if ctx['fragment_index'] > 0 and resume_len == 0:
+                    self.report_error(
+                        'Inconsistent state of incomplete fragment download. '
+                        'Restarting from the beginning...')
+                    ctx['fragment_index'] = resume_len = 0
+                    self._write_ytdl_file(ctx)
             else:
                 self._write_ytdl_file(ctx)
-            if ctx['fragment_index'] > 0:
-                assert resume_len > 0
+                assert ctx['fragment_index'] == 0

         dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode)
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index 0e29c8a2a..46308cf07 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -59,9 +59,9 @@ class HlsFD(FragmentFD):
         man_url = info_dict['url']
         self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)

-        manifest = self.ydl.urlopen(self._prepare_url(info_dict, man_url)).read()
-
-        s = manifest.decode('utf-8', 'ignore')
+        urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
+        man_url = urlh.geturl()
+        s = urlh.read().decode('utf-8', 'ignore')

         if not self.can_download(s, info_dict):
             if info_dict.get('extra_param_to_segment_url'):
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index af405b950..8a6638cc2 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -22,8 +22,16 @@ from ..utils import (

 class HttpFD(FileDownloader):
     def real_download(self, filename, info_dict):
         url = info_dict['url']
-        tmpfilename = self.temp_name(filename)
-        stream = None
+
+        class DownloadContext(dict):
+            __getattr__ = dict.get
+            __setattr__ = dict.__setitem__
+            __delattr__ = dict.__delitem__
+
+        ctx = DownloadContext()
+        ctx.filename = filename
+        ctx.tmpfilename = self.temp_name(filename)
+        ctx.stream = None

         # Do not include the Accept-Encoding header
         headers = {'Youtubedl-no-compression': 'True'}
@@ -38,46 +46,51 @@ class HttpFD(FileDownloader):
         if is_test:
             request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1))

-        # Establish possible resume length
-        if os.path.isfile(encodeFilename(tmpfilename)):
-            resume_len = os.path.getsize(encodeFilename(tmpfilename))
-        else:
-            resume_len = 0
-
-        open_mode = 'wb'
-        if resume_len != 0:
-            if self.params.get('continuedl', True):
-                self.report_resuming_byte(resume_len)
-                request.add_header('Range', 'bytes=%d-' % resume_len)
-                open_mode = 'ab'
-            else:
-                resume_len = 0
+        ctx.open_mode = 'wb'
+        ctx.resume_len = 0
+
+        if self.params.get('continuedl', True):
+            # Establish possible resume length
+            if os.path.isfile(encodeFilename(ctx.tmpfilename)):
+                ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename))

         count = 0
         retries = self.params.get('retries', 0)
-        while count <= retries:
+
+        class SucceedDownload(Exception):
+            pass
+
+        class RetryDownload(Exception):
+            def __init__(self, source_error):
+                self.source_error = source_error
+
+        def establish_connection():
+            if ctx.resume_len != 0:
+                self.report_resuming_byte(ctx.resume_len)
+                request.add_header('Range', 'bytes=%d-' % ctx.resume_len)
+                ctx.open_mode = 'ab'
             # Establish connection
             try:
-                data = self.ydl.urlopen(request)
+                ctx.data = self.ydl.urlopen(request)
                 # When trying to resume, Content-Range HTTP header of response has to be checked
                 # to match the value of requested Range HTTP header. This is due to webservers
                 # that don't support resuming and serve a whole file with no Content-Range
                 # set in response despite the requested Range (see
                 # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799)
-                if resume_len > 0:
-                    content_range = data.headers.get('Content-Range')
+                if ctx.resume_len > 0:
+                    content_range = ctx.data.headers.get('Content-Range')
                     if content_range:
                         content_range_m = re.search(r'bytes (\d+)-', content_range)
                         # Content-Range is present and matches requested Range, resume is possible
-                        if content_range_m and resume_len == int(content_range_m.group(1)):
-                            break
+                        if content_range_m and ctx.resume_len == int(content_range_m.group(1)):
+                            return
                     # Content-Range is either not present or invalid. Assuming remote webserver is
                     # trying to send the whole file, resume is not possible, so wiping the local file
                     # and performing entire redownload
                     self.report_unable_to_resume()
-                    resume_len = 0
-                    open_mode = 'wb'
-                break
+                    ctx.resume_len = 0
+                    ctx.open_mode = 'wb'
+                return
             except (compat_urllib_error.HTTPError, ) as err:
                 if (err.code < 500 or err.code >= 600) and err.code != 416:
                     # Unexpected HTTP error
@@ -86,15 +99,15 @@ class HttpFD(FileDownloader):
                     # Unable to resume (requested range not satisfiable)
                     try:
                         # Open the connection again without the range header
-                        data = self.ydl.urlopen(basic_request)
-                        content_length = data.info()['Content-Length']
+                        ctx.data = self.ydl.urlopen(basic_request)
+                        content_length = ctx.data.info()['Content-Length']
                     except (compat_urllib_error.HTTPError, ) as err:
                         if err.code < 500 or err.code >= 600:
                             raise
                     else:
                         # Examine the reported length
                         if (content_length is not None and
-                                (resume_len - 100 < int(content_length) < resume_len + 100)):
+                                (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)):
                             # The file had already been fully downloaded.
                             # Explanation to the above condition: in issue #175 it was revealed that
                             # YouTube sometimes adds or removes a few bytes from the end of the file,
@@ -102,152 +115,184 @@ class HttpFD(FileDownloader):
                             # I decided to implement a suggested change and consider the file
                             # completely downloaded if the file size differs less than 100 bytes from
                             # the one in the hard drive.
-                            self.report_file_already_downloaded(filename)
-                            self.try_rename(tmpfilename, filename)
+                            self.report_file_already_downloaded(ctx.filename)
+                            self.try_rename(ctx.tmpfilename, ctx.filename)
                             self._hook_progress({
-                                'filename': filename,
+                                'filename': ctx.filename,
                                 'status': 'finished',
-                                'downloaded_bytes': resume_len,
-                                'total_bytes': resume_len,
+                                'downloaded_bytes': ctx.resume_len,
+                                'total_bytes': ctx.resume_len,
                             })
-                            return True
+                            raise SucceedDownload()
                         else:
                             # The length does not match, we start the download over
                             self.report_unable_to_resume()
-                            resume_len = 0
-                            open_mode = 'wb'
-                            break
-            except socket.error as e:
-                if e.errno != errno.ECONNRESET:
+                            ctx.resume_len = 0
+                            ctx.open_mode = 'wb'
+                            return
+                raise RetryDownload(err)
+            except socket.error as err:
+                if err.errno != errno.ECONNRESET:
                     # Connection reset is no problem, just retry
                     raise
+                raise RetryDownload(err)
+
+        def download():
+            data_len = ctx.data.info().get('Content-length', None)
+
+            # Range HTTP header may be ignored/unsupported by a webserver
+            # (e.g. extractor/scivee.py, extractor/bambuser.py).
+            # However, for a test we still would like to download just a piece of a file.
+            # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
+            # block size when downloading a file.
+            if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
+                data_len = self._TEST_FILE_SIZE
+
+            if data_len is not None:
+                data_len = int(data_len) + ctx.resume_len
+                min_data_len = self.params.get('min_filesize')
+                max_data_len = self.params.get('max_filesize')
+                if min_data_len is not None and data_len < min_data_len:
+                    self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
+                    return False
+                if max_data_len is not None and data_len > max_data_len:
+                    self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
+                    return False

-            # Retry
-            count += 1
-            if count <= retries:
-                self.report_retry(count, retries)
-
-        if count > retries:
-            self.report_error('giving up after %s retries' % retries)
-            return False
-
-        data_len = data.info().get('Content-length', None)
-
-        # Range HTTP header may be ignored/unsupported by a webserver
-        # (e.g. extractor/scivee.py, extractor/bambuser.py).
-        # However, for a test we still would like to download just a piece of a file.
-        # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
-        # block size when downloading a file.
-        if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
-            data_len = self._TEST_FILE_SIZE
-
-        if data_len is not None:
-            data_len = int(data_len) + resume_len
-            min_data_len = self.params.get('min_filesize')
-            max_data_len = self.params.get('max_filesize')
-            if min_data_len is not None and data_len < min_data_len:
-                self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
-                return False
-            if max_data_len is not None and data_len > max_data_len:
-                self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
-                return False
-
-        byte_counter = 0 + resume_len
-        block_size = self.params.get('buffersize', 1024)
-        start = time.time()
+            byte_counter = 0 + ctx.resume_len
+            block_size = self.params.get('buffersize', 1024)
+            start = time.time()

-        # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
-        now = None  # needed for slow_down() in the first loop run
-        before = start  # start measuring
-        while True:
+            # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
+            now = None  # needed for slow_down() in the first loop run
+            before = start  # start measuring

-            # Download and write
-            data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
-            byte_counter += len(data_block)
+            def retry(e):
+                if ctx.tmpfilename != '-':
+                    ctx.stream.close()
+                ctx.stream = None
+                ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename))
+                raise RetryDownload(e)

-            # exit loop when download is finished
-            if len(data_block) == 0:
-                break
+            while True:
+                try:
+                    # Download and write
+                    data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
+                # socket.timeout is a subclass of socket.error but may not have
+                # errno set
+                except socket.timeout as e:
+                    retry(e)
+                except socket.error as e:
+                    if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT):
+                        raise
+                    retry(e)
+
+                byte_counter += len(data_block)
+
+                # exit loop when download is finished
+                if len(data_block) == 0:
+                    break
+
+                # Open destination file just in time
+                if ctx.stream is None:
+                    try:
+                        ctx.stream, ctx.tmpfilename = sanitize_open(
+                            ctx.tmpfilename, ctx.open_mode)
+                        assert ctx.stream is not None
+                        ctx.filename = self.undo_temp_name(ctx.tmpfilename)
+                        self.report_destination(ctx.filename)
+                    except (OSError, IOError) as err:
+                        self.report_error('unable to open for writing: %s' % str(err))
+                        return False
+
+                    if self.params.get('xattr_set_filesize', False) and data_len is not None:
+                        try:
+                            write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8'))
+                        except (XAttrUnavailableError, XAttrMetadataError) as err:
+                            self.report_error('unable to set filesize xattr: %s' % str(err))

-            # Open destination file just in time
-            if stream is None:
                 try:
-                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
-                    assert stream is not None
-                    filename = self.undo_temp_name(tmpfilename)
-                    self.report_destination(filename)
-                except (OSError, IOError) as err:
-                    self.report_error('unable to open for writing: %s' % str(err))
+                    ctx.stream.write(data_block)
+                except (IOError, OSError) as err:
+                    self.to_stderr('\n')
+                    self.report_error('unable to write data: %s' % str(err))
                     return False
-                if self.params.get('xattr_set_filesize', False) and data_len is not None:
-                    try:
-                        write_xattr(tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8'))
-                    except (XAttrUnavailableError, XAttrMetadataError) as err:
-                        self.report_error('unable to set filesize xattr: %s' % str(err))
-
-            try:
-                stream.write(data_block)
-            except (IOError, OSError) as err:
+                # Apply rate limit
+                self.slow_down(start, now, byte_counter - ctx.resume_len)
+
+                # end measuring of one loop run
+                now = time.time()
+                after = now
+
+                # Adjust block size
+                if not self.params.get('noresizebuffer', False):
+                    block_size = self.best_block_size(after - before, len(data_block))
+
+                before = after
+
+                # Progress message
+                speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
+                if data_len is None:
+                    eta = None
+                else:
+                    eta = self.calc_eta(start, time.time(), data_len - ctx.resume_len, byte_counter - ctx.resume_len)
+
+                self._hook_progress({
+                    'status': 'downloading',
+                    'downloaded_bytes': byte_counter,
+                    'total_bytes': data_len,
+                    'tmpfilename': ctx.tmpfilename,
+                    'filename': ctx.filename,
+                    'eta': eta,
+                    'speed': speed,
+                    'elapsed': now - start,
+                })
+
+                if is_test and byte_counter == data_len:
+                    break
+
+            if ctx.stream is None:
                 self.to_stderr('\n')
-                self.report_error('unable to write data: %s' % str(err))
+                self.report_error('Did not get any data blocks')
                 return False
+            if ctx.tmpfilename != '-':
+                ctx.stream.close()

-            # Apply rate limit
-            self.slow_down(start, now, byte_counter - resume_len)
+            if data_len is not None and byte_counter != data_len:
+                err = ContentTooShortError(byte_counter, int(data_len))
+                if count <= retries:
+                    retry(err)
+                raise err

-            # end measuring of one loop run
-            now = time.time()
-            after = now
+            self.try_rename(ctx.tmpfilename, ctx.filename)

-            # Adjust block size
-            if not self.params.get('noresizebuffer', False):
-                block_size = self.best_block_size(after - before, len(data_block))
-
-            before = after
-
-            # Progress message
-            speed = self.calc_speed(start, now, byte_counter - resume_len)
-            if data_len is None:
-                eta = None
-            else:
-                eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
+            # Update file modification time
+            if self.params.get('updatetime', True):
+                info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None))

             self._hook_progress({
-                'status': 'downloading',
                 'downloaded_bytes': byte_counter,
-                'total_bytes': data_len,
-                'tmpfilename': tmpfilename,
-                'filename': filename,
-                'eta': eta,
-                'speed': speed,
-                'elapsed': now - start,
+                'total_bytes': byte_counter,
+                'filename': ctx.filename,
+                'status': 'finished',
+                'elapsed': time.time() - start,
             })

-            if is_test and byte_counter == data_len:
-                break
-
-        if stream is None:
-            self.to_stderr('\n')
-            self.report_error('Did not get any data blocks')
-            return False
-        if tmpfilename != '-':
-            stream.close()
-
-        if data_len is not None and byte_counter != data_len:
-            raise ContentTooShortError(byte_counter, int(data_len))
-        self.try_rename(tmpfilename, filename)
-
-        # Update file modification time
-        if self.params.get('updatetime', True):
-            info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
-
-        self._hook_progress({
-            'downloaded_bytes': byte_counter,
-            'total_bytes': byte_counter,
-            'filename': filename,
-            'status': 'finished',
-            'elapsed': time.time() - start,
-        })
-
-        return True
+            return True
+
+        while count <= retries:
+            try:
+                establish_connection()
+                download()
+                return True
+            except RetryDownload as e:
+                count += 1
+                if count <= retries:
+                    self.report_retry(e.source_error, count, retries)
+                continue
+            except SucceedDownload:
+                return True
+
+        self.report_error('giving up after %s retries' % retries)
+        return False
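The rewrite above replaces one long while-loop with two inner functions, establish_connection() and download(), driven by control-flow exceptions: any helper, at any depth, can abort the attempt with RetryDownload or finish early with SucceedDownload, and only the small outer loop decides what happens next. The bare pattern as a runnable sketch (callback names are hypothetical):

    class RetryDownload(Exception):
        def __init__(self, source_error):
            self.source_error = source_error

    class SucceedDownload(Exception):
        pass

    def download_with_retries(establish_connection, download, retries):
        count = 0
        while count <= retries:
            try:
                establish_connection()
                download()
                return True
            except RetryDownload as e:
                count += 1
                if count <= retries:
                    print('got %s, retrying (attempt %d of %d)'
                          % (e.source_error, count, retries))
                continue
            except SucceedDownload:
                # e.g. the file turned out to be fully downloaded already
                return True
        print('giving up after %d retries' % retries)
        return False

Compared with break/continue flags, the exceptions carry the triggering error with them, which is how report_retry above gains its new err argument.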
diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py
index 5f6f9faef..9b001ecff 100644
--- a/youtube_dl/downloader/ism.py
+++ b/youtube_dl/downloader/ism.py
@@ -98,7 +98,7 @@ def write_piff_header(stream, params):
     if is_audio:
         smhd_payload = s88.pack(0)  # balance
-        smhd_payload = u16.pack(0)  # reserved
+        smhd_payload += u16.pack(0)  # reserved
         media_header_box = full_box(b'smhd', 0, 0, smhd_payload)  # Sound Media Header
     else:
         vmhd_payload = u16.pack(0)  # graphics mode
@@ -126,7 +126,6 @@ def write_piff_header(stream, params):
         if fourcc == 'AACL':
             sample_entry_box = box(b'mp4a', sample_entry_payload)
     else:
-        sample_entry_payload = sample_entry_payload
         sample_entry_payload += u16.pack(0)  # pre defined
         sample_entry_payload += u16.pack(0)  # reserved
         sample_entry_payload += u32.pack(0) * 3  # pre defined
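The first ism.py hunk fixes a classic accumulator typo: '=' instead of '+=' silently discarded the just-packed balance field, leaving the smhd box payload two bytes short. A minimal sketch with struct packers (the '>h'/'>H' formats are assumptions about what s88/u16 wrap in the surrounding code):

    import struct

    s88 = struct.Struct('>h')  # assumed: 8.8 fixed-point 'balance'
    u16 = struct.Struct('>H')  # assumed: unsigned 16-bit 'reserved'

    smhd_payload = s88.pack(0)   # balance
    smhd_payload += u16.pack(0)  # reserved; with '=' the balance bytes vanish

    assert len(smhd_payload) == 4  # smhd needs balance + reserved = 4 bytes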
diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py
index 0247cabf9..60f753b95 100644
--- a/youtube_dl/extractor/abc.py
+++ b/youtube_dl/extractor/abc.py
@@ -3,11 +3,13 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     js_to_json,
     int_or_none,
     parse_iso8601,
+    try_get,
 )
@@ -124,7 +126,20 @@ class ABCIViewIE(InfoExtractor):
         title = video_params.get('title') or video_params['seriesTitle']
         stream = next(s for s in video_params['playlist'] if s.get('type') == 'program')

-        formats = self._extract_akamai_formats(stream['hds-unmetered'], video_id)
+        format_urls = [
+            try_get(stream, lambda x: x['hds-unmetered'], compat_str)]
+
+        # May have higher quality video
+        sd_url = try_get(
+            stream, lambda x: x['streams']['hds']['sd'], compat_str)
+        if sd_url:
+            format_urls.append(sd_url.replace('metered', 'um'))
+
+        formats = []
+        for format_url in format_urls:
+            if format_url:
+                formats.extend(
+                    self._extract_akamai_formats(format_url, video_id))
         self._sort_formats(formats)

         subtitles = {}
diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py
index 4f56c4c11..f770fe901 100644
--- a/youtube_dl/extractor/abcnews.py
+++ b/youtube_dl/extractor/abcnews.py
@@ -7,12 +7,21 @@ import time

 from .amp import AMPIE
 from .common import InfoExtractor
+from .youtube import YoutubeIE
 from ..compat import compat_urlparse

 class AbcNewsVideoIE(AMPIE):
     IE_NAME = 'abcnews:video'
-    _VALID_URL = r'https?://abcnews\.go\.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        abcnews\.go\.com/
+                        (?:
+                            [^/]+/video/(?P<display_id>[0-9a-z-]+)-|
+                            video/embed\?.*?\bid=
+                        )
+                        (?P<id>\d+)
+                    '''

     _TESTS = [{
         'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
@@ -30,6 +39,9 @@ class AbcNewsVideoIE(AMPIE):
             'skip_download': True,
         },
     }, {
+        'url': 'http://abcnews.go.com/video/embed?id=46979033',
+        'only_matching': True,
+    }, {
         'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
         'only_matching': True,
     }]
@@ -97,9 +109,7 @@ class AbcNewsIE(InfoExtractor):
             r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
         full_video_url = compat_urlparse.urljoin(url, video_url)

-        youtube_url = self._html_search_regex(
-            r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"',
-            webpage, 'YouTube URL', default=None)
+        youtube_url = YoutubeIE._extract_url(webpage)

         timestamp = None
         date_str = self._html_search_regex(
@@ -129,7 +139,7 @@ class AbcNewsIE(InfoExtractor):
         }

         if youtube_url:
-            entries = [entry, self.url_result(youtube_url, 'Youtube')]
+            entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())]
             return self.playlist_result(entries)

         return entry
diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py
index 76e98132b..03b92a39c 100644
--- a/youtube_dl/extractor/abcotvs.py
+++ b/youtube_dl/extractor/abcotvs.py
@@ -22,7 +22,7 @@ class ABCOTVSIE(InfoExtractor):
                 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
                 'ext': 'mp4',
                 'title': 'East Bay museum celebrates vintage synthesizers',
-                'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10',
+                'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3',
                 'thumbnail': r're:^https?://.*\.jpg$',
                 'timestamp': 1421123075,
                 'upload_date': '20150113',
diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py
index 66caf6a81..cffdab6ca 100644
--- a/youtube_dl/extractor/adn.py
+++ b/youtube_dl/extractor/adn.py
@@ -15,6 +15,7 @@ from ..utils import (
     intlist_to_bytes,
     srt_subtitles_timecode,
     strip_or_none,
+    urljoin,
 )
@@ -31,25 +32,28 @@ class ADNIE(InfoExtractor):
             'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
         }
     }
+    _BASE_URL = 'http://animedigitalnetwork.fr'

     def _get_subtitles(self, sub_path, video_id):
         if not sub_path:
             return None
         enc_subtitles = self._download_webpage(
-            'http://animedigitalnetwork.fr/' + sub_path,
-            video_id, fatal=False)
+            urljoin(self._BASE_URL, sub_path),
+            video_id, fatal=False, headers={
+                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0',
+            })
         if not enc_subtitles:
             return None

         # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
         dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
             bytes_to_intlist(base64.b64decode(enc_subtitles[24:])),
-            bytes_to_intlist(b'\nd\xaf\xd2J\xd0\xfc\xe1\xfc\xdf\xb61\xe8\xe1\xf0\xcc'),
+            bytes_to_intlist(b'\x1b\xe0\x29\x61\x38\x94\x24\x00\x12\xbd\xc5\x80\xac\xce\xbe\xb0'),
             bytes_to_intlist(base64.b64decode(enc_subtitles[:24]))
         ))
         subtitles_json = self._parse_json(
-            dec_subtitles[:-compat_ord(dec_subtitles[-1])],
+            dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(),
             None, fatal=False)
         if not subtitles_json:
             return None
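The adn.py subtitle blob is base64 text whose first 24 characters decode to the 16-byte AES-CBC IV; the rest is ciphertext, and the last plaintext byte gives the PKCS#7 pad length stripped by the compat_ord trick above. A sketch of the unwrapping built on youtube-dl's own helpers (the key argument is a dummy placeholder, not the real one):

    import base64

    from youtube_dl.aes import aes_cbc_decrypt
    from youtube_dl.compat import compat_ord
    from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes

    def decrypt_subtitles(enc_subtitles, key):
        # enc_subtitles: base64 str, IV in the first 24 chars; key: 16 raw bytes
        dec = intlist_to_bytes(aes_cbc_decrypt(
            bytes_to_intlist(base64.b64decode(enc_subtitles[24:])),
            bytes_to_intlist(key),
            bytes_to_intlist(base64.b64decode(enc_subtitles[:24]))))
        # The last plaintext byte is the PKCS#7 pad length; strip it before parsing
        return dec[:-compat_ord(dec[-1])].decode()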
@@ -103,9 +107,18 @@ class ADNIE(InfoExtractor):
         metas = options.get('metas') or {}
         title = metas.get('title') or video_info['title']
         links = player_config.get('links') or {}
+        error = None
+        if not links:
+            links_url = player_config['linksurl']
+            links_data = self._download_json(urljoin(
+                self._BASE_URL, links_url), video_id)
+            links = links_data.get('links') or {}
+            error = links_data.get('error')

         formats = []
         for format_id, qualities in links.items():
+            if not isinstance(qualities, dict):
+                continue
             for load_balancer_url in qualities.values():
                 load_balancer_data = self._download_json(
                     load_balancer_url, video_id, fatal=False) or {}
@@ -119,7 +132,8 @@ class ADNIE(InfoExtractor):
                     for f in m3u8_formats:
                         f['language'] = 'fr'
                 formats.extend(m3u8_formats)
-        error = options.get('error')
+        if not error:
+            error = options.get('error')
         if not formats and error:
             raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
         self._sort_formats(formats)
diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py
index 7da96c65c..b83b51efb 100644
--- a/youtube_dl/extractor/adobepass.py
+++ b/youtube_dl/extractor/adobepass.py
@@ -6,12 +6,16 @@ import time
 import xml.etree.ElementTree as etree

 from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+    compat_kwargs,
+    compat_urlparse,
+)
 from ..utils import (
     unescapeHTML,
     urlencode_postdata,
     unified_timestamp,
     ExtractorError,
+    NO_DEFAULT,
 )
@@ -21,6 +25,11 @@ MSO_INFO = {
         'username_field': 'username',
         'password_field': 'password',
     },
+    'ATTOTT': {
+        'name': 'DIRECTV NOW',
+        'username_field': 'email',
+        'password_field': 'loginpassword',
+    },
     'Rogers': {
         'name': 'Rogers',
         'username_field': 'UserName',
@@ -36,6 +45,11 @@ MSO_INFO = {
         'username_field': 'Ecom_User_ID',
         'password_field': 'Ecom_Password',
     },
+    'Brighthouse': {
+        'name': 'Bright House Networks | Spectrum',
+        'username_field': 'j_username',
+        'password_field': 'j_password',
+    },
     'Charter_Direct': {
         'name': 'Charter Spectrum',
         'username_field': 'IDToken1',
@@ -1308,11 +1322,14 @@ class AdobePassIE(InfoExtractor):
     _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
     _MVPD_CACHE = 'ap-mvpd'

+    _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page'
+
     def _download_webpage_handle(self, *args, **kwargs):
         headers = kwargs.get('headers', {})
         headers.update(self.geo_verification_headers())
         kwargs['headers'] = headers
-        return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs)
+        return super(AdobePassIE, self)._download_webpage_handle(
+            *args, **compat_kwargs(kwargs))

     @staticmethod
     def _get_mvpd_resource(provider_id, title, guid, rating):
@@ -1356,6 +1373,21 @@ class AdobePassIE(InfoExtractor):
                 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier '
                 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True)

+        def extract_redirect_url(html, url=None, fatal=False):
+            # TODO: eliminate code duplication with generic extractor and move
+            # redirection code into _download_webpage_handle
+            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
+            redirect_url = self._search_regex(
+                r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+                r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
+                html, 'meta refresh redirect',
+                default=NO_DEFAULT if fatal else None, fatal=fatal)
+            if not redirect_url:
+                return None
+            if url:
+                redirect_url = compat_urlparse.urljoin(url, unescapeHTML(redirect_url))
+            return redirect_url
+
         mvpd_headers = {
             'ap_42': 'anonymous',
             'ap_11': 'Linux i686',
@@ -1405,16 +1437,15 @@ class AdobePassIE(InfoExtractor):
                         if '<form name="signin"' in provider_redirect_page:
                             provider_login_page_res = provider_redirect_page_res
                         elif 'http-equiv="refresh"' in provider_redirect_page:
-                            oauth_redirect_url = self._html_search_regex(
-                                r'content="0;\s*url=([^\'"]+)',
-                                provider_redirect_page, 'meta refresh redirect')
+                            oauth_redirect_url = extract_redirect_url(
+                                provider_redirect_page, fatal=True)
                             provider_login_page_res = self._download_webpage_handle(
                                 oauth_redirect_url, video_id,
-                                'Downloading Provider Login Page')
+                                self._DOWNLOADING_LOGIN_PAGE)
                         else:
                             provider_login_page_res = post_form(
                                 provider_redirect_page_res,
-                                'Downloading Provider Login Page')
+                                self._DOWNLOADING_LOGIN_PAGE)

                         mvpd_confirm_page_res = post_form(
                             provider_login_page_res, 'Logging in', {
@@ -1461,8 +1492,17 @@ class AdobePassIE(InfoExtractor):
                             'Content-Type': 'application/x-www-form-urlencoded'
                         })
                 else:
+                    # Some providers (e.g. DIRECTV NOW) have another meta refresh
+                    # based redirect that should be followed.
+                    provider_redirect_page, urlh = provider_redirect_page_res +                    provider_refresh_redirect_url = extract_redirect_url( +                        provider_redirect_page, url=urlh.geturl()) +                    if provider_refresh_redirect_url: +                        provider_redirect_page_res = self._download_webpage_handle( +                            provider_refresh_redirect_url, video_id, +                            'Downloading Provider Redirect Page (meta refresh)')                      provider_login_page_res = post_form( -                        provider_redirect_page_res, 'Downloading Provider Login Page') +                        provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE)                      mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', {                          mso_info.get('username_field', 'username'): username,                          mso_info.get('password_field', 'password'): password, diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 989505c82..acc4ce38d 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -5,91 +5,52 @@ import re  from .turner import TurnerBaseIE  from ..utils import ( -    ExtractorError,      int_or_none, +    strip_or_none,  )  class AdultSwimIE(TurnerBaseIE): -    _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?' +    _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?'      _TESTS = [{          'url': 'http://adultswim.com/videos/rick-and-morty/pilot', -        'playlist': [ -            { -                'md5': '247572debc75c7652f253c8daa51a14d', -                'info_dict': { -                    'id': 'rQxZvXQ4ROaSOqq-or2Mow-0', -                    'ext': 'flv', -                    'title': 'Rick and Morty - Pilot Part 1', -                    'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " -                }, -            }, -            { -                'md5': '77b0e037a4b20ec6b98671c4c379f48d', -                'info_dict': { -                    'id': 'rQxZvXQ4ROaSOqq-or2Mow-3', -                    'ext': 'flv', -                    'title': 'Rick and Morty - Pilot Part 4', -                    'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " -                }, -            }, -        ],          'info_dict': {              'id': 'rQxZvXQ4ROaSOqq-or2Mow', +            'ext': 'mp4',              'title': 'Rick and Morty - Pilot', -            'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. 
" +            'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.', +            'timestamp': 1493267400, +            'upload_date': '20170427',          }, -        'skip': 'This video is only available for registered users', -    }, { -        'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/', -        'playlist': [ -            { -                'md5': '2eb5c06d0f9a1539da3718d897f13ec5', -                'info_dict': { -                    'id': '-t8CamQlQ2aYZ49ItZCFog-0', -                    'ext': 'flv', -                    'title': 'American Dad - Putting Francine Out of Business', -                    'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' -                }, -            } -        ], -        'info_dict': { -            'id': '-t8CamQlQ2aYZ49ItZCFog', -            'title': 'American Dad - Putting Francine Out of Business', -            'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' +        'params': { +            # m3u8 download +            'skip_download': True,          }, +        'expected_warnings': ['Unable to download f4m manifest'],      }, {          'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', -        'playlist': [ -            { -                'md5': '3e346a2ab0087d687a05e1e7f3b3e529', -                'info_dict': { -                    'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', -                    'ext': 'mp4', -                    'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', -                    'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', -                }, -            } -        ],          'info_dict': {              'id': 'sY3cMUR_TbuE4YmdjzbIcQ', +            'ext': 'mp4',              'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', -            'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', +            'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.', +            'upload_date': '20080124', +            'timestamp': 1201150800,          },          'params': {              # m3u8 download              'skip_download': True, -        } +        },      }, { -        # heroMetadata.trailer          'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/',          'info_dict': {              'id': 'I0LQFQkaSUaFp8PnAWHhoQ',              'ext': 'mp4',              'title': 'Decker - Inside Decker: A New Hero', -            'description': 'md5:c916df071d425d62d70c86d4399d3ee0', -            'duration': 249.008, +            'description': 'The guys recap the conclusion of the season. 
They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.', +            'timestamp': 1469480460, +            'upload_date': '20160725',          },          'params': {              # m3u8 download @@ -97,136 +58,102 @@ class AdultSwimIE(TurnerBaseIE):          },          'expected_warnings': ['Unable to download f4m manifest'],      }, { -        'url': 'http://www.adultswim.com/videos/toonami/friday-october-14th-2016/', +        'url': 'http://www.adultswim.com/videos/attack-on-titan', +        'info_dict': { +            'id': 'b7A69dzfRzuaXIECdxW8XQ', +            'title': 'Attack on Titan', +            'description': 'md5:6c8e003ea0777b47013e894767f5e114', +        }, +        'playlist_mincount': 12, +    }, { +        'url': 'http://www.adultswim.com/videos/streams/williams-stream',          'info_dict': { -            'id': 'eYiLsKVgQ6qTC6agD67Sig', -            'title': 'Toonami - Friday, October 14th, 2016', -            'description': 'md5:99892c96ffc85e159a428de85c30acde', +            'id': 'd8DEBj7QRfetLsRgFnGEyg', +            'ext': 'mp4', +            'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', +            'description': 'original programming',          }, -        'playlist': [{ -            'md5': '', -            'info_dict': { -                'id': 'eYiLsKVgQ6qTC6agD67Sig', -                'ext': 'mp4', -                'title': 'Toonami - Friday, October 14th, 2016', -                'description': 'md5:99892c96ffc85e159a428de85c30acde', -            }, -        }],          'params': {              # m3u8 download              'skip_download': True,          }, -        'expected_warnings': ['Unable to download f4m manifest'],      }] -    @staticmethod -    def find_video_info(collection, slug): -        for video in collection.get('videos'): -            if video.get('slug') == slug: -                return video - -    @staticmethod -    def find_collection_by_linkURL(collections, linkURL): -        for collection in collections: -            if collection.get('linkURL') == linkURL: -                return collection - -    @staticmethod -    def find_collection_containing_video(collections, slug): -        for collection in collections: -            for video in collection.get('videos'): -                if video.get('slug') == slug: -                    return collection, video -        return None, None -      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        show_path = mobj.group('show_path') -        episode_path = mobj.group('episode_path') -        is_playlist = True if mobj.group('is_playlist') else False - -        webpage = self._download_webpage(url, episode_path) - -        # Extract the value of `bootstrappedData` from the Javascript in the page. -        bootstrapped_data = self._parse_json(self._search_regex( -            r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path) - -        # Downloading videos from a /videos/playlist/ URL needs to be handled differently. 
-        # NOTE: We are only downloading one video (the current one) not the playlist -        if is_playlist: -            collections = bootstrapped_data['playlists']['collections'] -            collection = self.find_collection_by_linkURL(collections, show_path) -            video_info = self.find_video_info(collection, episode_path) - -            show_title = video_info['showTitle'] -            segment_ids = [video_info['videoPlaybackID']] +        show_path, episode_path = re.match(self._VALID_URL, url).groups() +        display_id = episode_path or show_path +        webpage = self._download_webpage(url, display_id) +        initial_data = self._parse_json(self._search_regex( +            r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});', +            webpage, 'initial data'), display_id) + +        is_stream = show_path == 'streams' +        if is_stream: +            if not episode_path: +                episode_path = 'live-stream' + +            video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path) +            video_id = video_data.get('stream') + +            if not video_id: +                entries = [] +                for episode in video_data.get('archiveEpisodes', []): +                    episode_url = episode.get('url') +                    if not episode_url: +                        continue +                    entries.append(self.url_result( +                        episode_url, 'AdultSwim', episode.get('id'))) +                return self.playlist_result( +                    entries, video_data.get('id'), video_data.get('title'), +                    strip_or_none(video_data.get('description')))          else: -            collections = bootstrapped_data['show']['collections'] -            collection, video_info = self.find_collection_containing_video(collections, episode_path) -            # Video wasn't found in the collections, let's try `slugged_video`. -            if video_info is None: -                if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: -                    video_info = bootstrapped_data['slugged_video'] -            if not video_info: -                video_info = bootstrapped_data.get( -                    'heroMetadata', {}).get('trailer', {}).get('video') -            if not video_info: -                video_info = bootstrapped_data.get('onlineOriginals', [None])[0] -            if not video_info: -                raise ExtractorError('Unable to find video info') - -            show = bootstrapped_data['show'] -            show_title = show['title'] -            stream = video_info.get('stream') -            if stream and stream.get('videoPlaybackID'): -                segment_ids = [stream['videoPlaybackID']] -            elif video_info.get('clips'): -                segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] -            elif video_info.get('videoPlaybackID'): -                segment_ids = [video_info['videoPlaybackID']] -            elif video_info.get('id'): -                segment_ids = [video_info['id']] -            else: -                if video_info.get('auth') is True: -                    raise ExtractorError( -                        'This video is only available via cable service provider subscription that' -                        ' is not currently supported. 
You may want to use --cookies.', expected=True) -                else: -                    raise ExtractorError('Unable to find stream or clips') - -        episode_id = video_info['id'] -        episode_title = video_info['title'] -        episode_description = video_info.get('description') -        episode_duration = int_or_none(video_info.get('duration')) -        view_count = int_or_none(video_info.get('views')) +            show_data = initial_data['show'] + +            if not episode_path: +                entries = [] +                for video in show_data.get('videos', []): +                    slug = video.get('slug') +                    if not slug: +                        continue +                    entries.append(self.url_result( +                        'http://adultswim.com/videos/%s/%s' % (show_path, slug), +                        'AdultSwim', video.get('id'))) +                return self.playlist_result( +                    entries, show_data.get('id'), show_data.get('title'), +                    strip_or_none(show_data.get('metadata', {}).get('description'))) + +            video_data = show_data['sluggedVideo'] +            video_id = video_data['id'] + +        info = self._extract_cvp_info( +            'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id, +            video_id, { +                'secure': { +                    'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', +                    'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', +                }, +            }, { +                'url': url, +                'site_name': 'AdultSwim', +                'auth_required': video_data.get('auth'), +            }) -        entries = [] -        for part_num, segment_id in enumerate(segment_ids): -            segement_info = self._extract_cvp_info( -                'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id, -                segment_id, { -                    'secure': { -                        'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', -                        'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', -                    }, -                }) -            segment_title = '%s - %s' % (show_title, episode_title) -            if len(segment_ids) > 1: -                segment_title += ' Part %d' % (part_num + 1) -            segement_info.update({ -                'id': segment_id, -                'title': segment_title, -                'description': episode_description, +        info.update({ +            'id': video_id, +            'display_id': display_id, +            'description': info.get('description') or strip_or_none(video_data.get('description')), +        }) +        if not is_stream: +            info.update({ +                'duration': info.get('duration') or int_or_none(video_data.get('duration')), +                'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')), +                'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')), +                'episode': info['title'], +                'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')),              }) -            entries.append(segement_info) -        return { -            '_type': 'playlist', -            'id': episode_id, -            
'display_id': episode_path, -            'entries': entries, -            'title': '%s - %s' % (show_title, episode_title), -            'description': episode_description, -            'duration': episode_duration, -            'view_count': view_count, -        } +            info['series'] = video_data.get('collection_title') or info.get('series') +            if info['series'] and info['series'] != info['title']: +                info['title'] = '%s - %s' % (info['series'], info['title']) + +        return info diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py new file mode 100644 index 000000000..6f241e683 --- /dev/null +++ b/youtube_dl/extractor/aliexpress.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    float_or_none, +    try_get, +) + + +class AliExpressLiveIE(InfoExtractor): +    _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>\d+)' +    _TEST = { +        'url': 'https://live.aliexpress.com/live/2800002704436634', +        'md5': 'e729e25d47c5e557f2630eaf99b740a5', +        'info_dict': { +            'id': '2800002704436634', +            'ext': 'mp4', +            'title': 'CASIMA7.22', +            'thumbnail': r're:http://.*\.jpg', +            'uploader': 'CASIMA Official Store', +            'timestamp': 1500717600, +            'upload_date': '20170722', +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        data = self._parse_json( +            self._search_regex( +                r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var', +                webpage, 'runParams'), +            video_id) + +        title = data['title'] + +        formats = self._extract_m3u8_formats( +            data['replyStreamUrl'], video_id, 'mp4', +            entry_protocol='m3u8_native', m3u8_id='hls') + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': data.get('coverUrl'), +            'uploader': try_get( +                data, lambda x: x['followBar']['name'], compat_str), +            'timestamp': float_or_none(data.get('startTimeLong'), scale=1000), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 388e578d5..c68be3134 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,9 +4,9 @@ from .common import InfoExtractor  class AlJazeeraIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html' +    _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html' -    _TEST = { +    _TESTS = [{          'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',          'info_dict': {              'id': '3792260579001', @@ -19,7 +19,10 @@ class AlJazeeraIE(InfoExtractor):          },          'add_ie': ['BrightcoveNew'],          'skip': 'Not accessible from Travis CI server', -    } +    }, { +        'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', +        'only_matching': True, +    }]      BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'      def _real_extract(self, url): diff --git 
a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index 3a0ec6776..dd3b18d72 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals  from .theplatform import ThePlatformIE  from ..utils import ( -    update_url_query, -    parse_age_limit,      int_or_none, +    parse_age_limit, +    try_get, +    update_url_query,  ) @@ -68,7 +69,8 @@ class AMCNetworksIE(ThePlatformIE):          info = self._parse_theplatform_metadata(theplatform_metadata)          video_id = theplatform_metadata['pid']          title = theplatform_metadata['title'] -        rating = theplatform_metadata['ratings'][0]['rating'] +        rating = try_get( +            theplatform_metadata, lambda x: x['ratings'][0]['rating'])          auth_required = self._search_regex(              r'window\.authRequired\s*=\s*(true|false);',              webpage, 'auth required') diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9e28f2579..69d363311 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,16 +3,13 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import ( -    compat_urlparse, -    compat_str, -) +from ..compat import compat_str  from ..utils import (      determine_ext,      extract_attributes,      ExtractorError, -    sanitized_Request,      urlencode_postdata, +    urljoin,  ) @@ -21,6 +18,8 @@ class AnimeOnDemandIE(InfoExtractor):      _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in'      _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'      _NETRC_MACHINE = 'animeondemand' +    # German-speaking countries of Europe +    _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU']      _TESTS = [{          # jap, OmU          'url': 'https://www.anime-on-demand.de/anime/161', @@ -46,6 +45,10 @@ class AnimeOnDemandIE(InfoExtractor):          # Full length film, non-series, ger/jap, Dub/OmU, account required          'url': 'https://www.anime-on-demand.de/anime/185',          'only_matching': True, +    }, { +        # Flash videos +        'url': 'https://www.anime-on-demand.de/anime/12', +        'only_matching': True,      }]      def _login(self): @@ -72,14 +75,13 @@ class AnimeOnDemandIE(InfoExtractor):              'post url', default=self._LOGIN_URL, group='url')          if not post_url.startswith('http'): -            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - -        request = sanitized_Request( -            post_url, urlencode_postdata(login_form)) -        request.add_header('Referer', self._LOGIN_URL) +            post_url = urljoin(self._LOGIN_URL, post_url)          response = self._download_webpage( -            request, None, 'Logging in as %s' % username) +            post_url, None, 'Logging in as %s' % username, +            data=urlencode_postdata(login_form), headers={ +                'Referer': self._LOGIN_URL, +            })          if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')):              error = self._search_regex( @@ -120,10 +122,11 @@ class AnimeOnDemandIE(InfoExtractor):              formats = []              for input_ in re.findall( -                    r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html): +                    r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html):                  attributes = extract_attributes(input_) +                
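# Editor's aside, a small sketch (not part of the patch): extract_attributes
# from youtube_dl.utils parses a single HTML start tag into a dict of its
# attributes, which is how each streamstarter <input> element is read below.
# The tag string here is a made-up example.
from youtube_dl.utils import extract_attributes

tag = ('<input class="streamstarter" data-playlist="/p/123" '
       'data-dialog-header="Episode 1">')
attrs = extract_attributes(tag)
assert attrs['data-playlist'] == '/p/123'
assert attrs.get('data-stream') is None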
title = attributes.get('data-dialog-header')                  playlist_urls = [] -                for playlist_key in ('data-playlist', 'data-otherplaylist'): +                for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'):                      playlist_url = attributes.get(playlist_key)                      if isinstance(playlist_url, compat_str) and re.match(                              r'/?[\da-zA-Z]+', playlist_url): @@ -147,19 +150,38 @@ class AnimeOnDemandIE(InfoExtractor):                          format_id_list.append(compat_str(num))                      format_id = '-'.join(format_id_list)                      format_note = ', '.join(filter(None, (kind, lang_note))) -                    request = sanitized_Request( -                        compat_urlparse.urljoin(url, playlist_url), +                    item_id_list = [] +                    if format_id: +                        item_id_list.append(format_id) +                    item_id_list.append('videomaterial') +                    playlist = self._download_json( +                        urljoin(url, playlist_url), video_id, +                        'Downloading %s JSON' % ' '.join(item_id_list),                          headers={                              'X-Requested-With': 'XMLHttpRequest',                              'X-CSRF-Token': csrf_token,                              'Referer': url,                              'Accept': 'application/json, text/javascript, */*; q=0.01', -                        }) -                    playlist = self._download_json( -                        request, video_id, 'Downloading %s playlist JSON' % format_id, -                        fatal=False) +                        }, fatal=False)                      if not playlist:                          continue +                    stream_url = playlist.get('streamurl') +                    if stream_url: +                        rtmp = re.search( +                            r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', +                            stream_url) +                        if rtmp: +                            formats.append({ +                                'url': rtmp.group('url'), +                                'app': rtmp.group('app'), +                                'play_path': rtmp.group('playpath'), +                                'page_url': url, +                                'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', +                                'rtmp_real_time': True, +                                'format_id': 'rtmp', +                                'ext': 'flv', +                            }) +                            continue                      start_video = playlist.get('startvideo', 0)                      playlist = playlist.get('playlist')                      if not playlist or not isinstance(playlist, list): @@ -222,7 +244,7 @@ class AnimeOnDemandIE(InfoExtractor):                      f.update({                          'id': '%s-%s' % (f['id'], m.group('kind').lower()),                          'title': m.group('title'), -                        'url': compat_urlparse.urljoin(url, m.group('href')), +                        'url': urljoin(url, m.group('href')),                      })                      entries.append(f) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 025e29aa4..e394cb661 100644 --- 
a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..utils import ( -    ExtractorError, -    HEADRequest, +    int_or_none, +    mimetype2ext,  )  class AparatIE(InfoExtractor): -    _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' +    _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'      _TEST = {          'url': 'http://www.aparat.com/v/wP8On', @@ -29,30 +29,41 @@ class AparatIE(InfoExtractor):          # Note: There is an easier-to-parse configuration at          # http://www.aparat.com/video/video/config/videohash/%video_id          # but the URL in there does not work -        embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id -        webpage = self._download_webpage(embed_url, video_id) - -        file_list = self._parse_json(self._search_regex( -            r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id) -        for i, item in enumerate(file_list[0]): -            video_url = item['file'] -            req = HEADRequest(video_url) -            res = self._request_webpage( -                req, video_id, note='Testing video URL %d' % i, errnote=False) -            if res: -                break -        else: -            raise ExtractorError('No working video URLs found') +        webpage = self._download_webpage( +            'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, +            video_id)          title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title') + +        file_list = self._parse_json( +            self._search_regex( +                r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, +                'file list'), +            video_id) + +        formats = [] +        for item in file_list[0]: +            file_url = item.get('file') +            if not file_url: +                continue +            ext = mimetype2ext(item.get('type')) +            label = item.get('label') +            formats.append({ +                'url': file_url, +                'ext': ext, +                'format_id': label or ext, +                'height': int_or_none(self._search_regex( +                    r'(\d+)[pP]', label or '', 'height', default=None)), +            }) +        self._sort_formats(formats) +          thumbnail = self._search_regex(              r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False)          return {              'id': video_id,              'title': title, -            'url': video_url, -            'ext': 'mp4',              'thumbnail': thumbnail,              'age_limit': self._family_friendly_search(webpage), +            'formats': formats,          } diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 2d5599456..3f248b147 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -93,6 +93,7 @@ class ARDMediathekIE(InfoExtractor):          duration = int_or_none(media_info.get('_duration'))          thumbnail = media_info.get('_previewImage') +        is_live = media_info.get('_isLive') is True          subtitles = {}          subtitle_url = media_info.get('_subtitleUrl') @@ -106,6 +107,7 @@ class ARDMediathekIE(InfoExtractor):              'id': video_id,              'duration': duration,              'thumbnail': thumbnail, +    
        'is_live': is_live,              'formats': formats,              'subtitles': subtitles,          } @@ -166,9 +168,11 @@ class ARDMediathekIE(InfoExtractor):          # determine video id from url          m = re.match(self._VALID_URL, url) +        document_id = None +          numid = re.search(r'documentId=([0-9]+)', url)          if numid: -            video_id = numid.group(1) +            document_id = video_id = numid.group(1)          else:              video_id = m.group('video_id') @@ -228,12 +232,16 @@ class ARDMediathekIE(InfoExtractor):                  'formats': formats,              }          else:  # request JSON file +            if not document_id: +                video_id = self._search_regex( +                    r'/play/(?:config|media)/(\d+)', webpage, 'media id')              info = self._extract_media_info( -                'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id) +                'http://www.ardmediathek.de/play/media/%s' % video_id, +                webpage, video_id)          info.update({              'id': video_id, -            'title': title, +            'title': self._live_title(title) if info.get('is_live') else title,              'description': description,              'thumbnail': thumbnail,          }) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 56baef29d..5cde90c5b 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -9,12 +9,13 @@ from ..compat import (      compat_urllib_parse_urlparse,  )  from ..utils import ( +    ExtractorError,      find_xpath_attr, -    unified_strdate,      get_element_by_attribute,      int_or_none,      NO_DEFAULT,      qualities, +    unified_strdate,  )  # There are different sources of video in arte.tv, the extraction process @@ -79,6 +80,13 @@ class ArteTVBaseIE(InfoExtractor):          info = self._download_json(json_url, video_id)          player_info = info['videoJsonPlayer'] +        vsr = player_info['VSR'] + +        if not vsr: +            raise ExtractorError( +                'Video %s is not available' % player_info.get('VID') or video_id, +                expected=True) +          upload_date_str = player_info.get('shootingDate')          if not upload_date_str:              upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] @@ -107,7 +115,7 @@ class ArteTVBaseIE(InfoExtractor):          langcode = LANGS.get(lang, lang)          formats = [] -        for format_id, format_dict in player_info['VSR'].items(): +        for format_id, format_dict in vsr.items():              f = dict(format_dict)              versionCode = f.get('versionCode')              l = re.escape(langcode) diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py new file mode 100644 index 000000000..594c88c9c --- /dev/null +++ b/youtube_dl/extractor/asiancrush.py @@ -0,0 +1,93 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import ( +    extract_attributes, +    remove_end, +    urlencode_postdata, +) + + +class AsianCrushIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?asiancrush\.com/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' +    _TESTS = [{ +        'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/', +        'md5': 'c3b740e48d0ba002a42c0b72857beae6', +        'info_dict': { +            'id': '1_y4tmjm5r', +            'ext': 'mp4', +      
      'title': 'Women Who Flirt', +            'description': 'md5:3db14e9186197857e7063522cb89a805', +            'timestamp': 1496936429, +            'upload_date': '20170608', +            'uploader_id': 'craig@crifkin.com', +        }, +    }, { +        'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        data = self._download_json( +            'https://www.asiancrush.com/wp-admin/admin-ajax.php', video_id, +            data=urlencode_postdata({ +                'postid': video_id, +                'action': 'get_channel_kaltura_vars', +            })) + +        entry_id = data['entry_id'] + +        return self.url_result( +            'kaltura:%s:%s' % (data['partner_id'], entry_id), +            ie=KalturaIE.ie_key(), video_id=entry_id, +            video_title=data.get('vid_label')) + + +class AsianCrushPlaylistIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?asiancrush\.com/series/0+(?P<id>\d+)s\b' +    _TEST = { +        'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/', +        'info_dict': { +            'id': '12481', +            'title': 'Scholar Who Walks the Night', +            'description': 'md5:7addd7c5132a09fd4741152d96cce886', +        }, +        'playlist_count': 20, +    } + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        entries = [] + +        for mobj in re.finditer( +                r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, +                webpage): +            attrs = extract_attributes(mobj.group(0)) +            if attrs.get('class') == 'clearfix': +                entries.append(self.url_result( +                    mobj.group('url'), ie=AsianCrushIE.ie_key())) + +        title = remove_end( +            self._html_search_regex( +                r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage, +                'title', default=None) or self._og_search_title( +                webpage, default=None) or self._html_search_meta( +                'twitter:title', webpage, 'title', +                default=None) or self._search_regex( +                r'<title>([^<]+)</title>', webpage, 'title', fatal=False), +            ' | AsianCrush') + +        description = self._og_search_description( +            webpage, default=None) or self._html_search_meta( +            'twitter:description', webpage, 'description', fatal=False) + +        return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py index e48bb8972..393f381c6 100644 --- a/youtube_dl/extractor/audioboom.py +++ b/youtube_dl/extractor/audioboom.py @@ -43,7 +43,7 @@ class AudioBoomIE(InfoExtractor):          def from_clip(field):              if clip: -                clip.get(field) +                return clip.get(field)          audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(              'audio', webpage, 'audio url') diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index df2972f26..be41bd5a2 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -14,14 +14,16 @@ from ..utils import (      ExtractorError,      float_or_none,      int_or_none, +    KNOWN_EXTENSIONS,      
parse_filesize,      unescapeHTML,      update_url_query, +    unified_strdate,  )  class BandcampIE(InfoExtractor): -    _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)' +    _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)'      _TESTS = [{          'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',          'md5': 'c557841d5e50261777a6585648adf439', @@ -47,6 +49,7 @@ class BandcampIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          title = mobj.group('title')          webpage = self._download_webpage(url, title) +        thumbnail = self._html_search_meta('og:image', webpage, default=None)          m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)          if not m_download:              m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) @@ -75,6 +78,7 @@ class BandcampIE(InfoExtractor):                  return {                      'id': track_id,                      'title': data['title'], +                    'thumbnail': thumbnail,                      'formats': formats,                      'duration': float_or_none(data.get('duration')),                  } @@ -143,7 +147,7 @@ class BandcampIE(InfoExtractor):          return {              'id': video_id,              'title': title, -            'thumbnail': info.get('thumb_url'), +            'thumbnail': info.get('thumb_url') or thumbnail,              'uploader': info.get('artist'),              'artist': artist,              'track': track, @@ -153,7 +157,7 @@ class BandcampIE(InfoExtractor):  class BandcampAlbumIE(InfoExtractor):      IE_NAME = 'Bandcamp:album' -    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))' +    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'      
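# Editor's aside (not part of the patch): with the /album/<id> part now
# optional, the relaxed _VALID_URL also matches bare artist pages; the second
# URL below is an illustrative example.
import re

_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'

assert re.match(_VALID_URL, 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1').group('album_id') == 'jazz-format-mixtape-vol-1'
assert re.match(_VALID_URL, 'https://someartist.bandcamp.com').group('album_id') is None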
_TESTS = [{          'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -220,6 +224,12 @@ class BandcampAlbumIE(InfoExtractor):          'playlist_count': 2,      }] +    @classmethod +    def suitable(cls, url): +        return (False +                if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url) +                else super(BandcampAlbumIE, cls).suitable(url)) +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          uploader_id = mobj.group('subdomain') @@ -232,7 +242,12 @@ class BandcampAlbumIE(InfoExtractor):              raise ExtractorError('The page doesn\'t contain any tracks')          # Only tracks with duration info have songs          entries = [ -            self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) +            self.url_result( +                compat_urlparse.urljoin(url, t_path), +                ie=BandcampIE.ie_key(), +                video_title=self._search_regex( +                    r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', +                    elem_content, 'track title', fatal=False))              for elem_content, t_path in track_elements              if self._html_search_meta('duration', elem_content, default=None)] @@ -248,3 +263,92 @@ class BandcampAlbumIE(InfoExtractor):              'title': title,              'entries': entries,          } + + +class BandcampWeeklyIE(InfoExtractor): +    IE_NAME = 'Bandcamp:weekly' +    _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' +    _TESTS = [{ +        'url': 'https://bandcamp.com/?show=224', +        'md5': 'b00df799c733cf7e0c567ed187dea0fd', +        'info_dict': { +            'id': '224', +            'ext': 'opus', +            'title': 'BC Weekly April 4th 2017 - Magic Moments', +            'description': 'md5:5d48150916e8e02d030623a48512c874', +            'duration': 5829.77, +            'release_date': '20170404', +            'series': 'Bandcamp Weekly', +            'episode': 'Magic Moments', +            'episode_number': 208, +            'episode_id': '224', +        } +    }, { +        'url': 'https://bandcamp.com/?blah/blah@&show=228', +        'only_matching': True +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        blob = self._parse_json( +            self._search_regex( +                r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, +                'blob', group='blob'), +            video_id, transform_source=unescapeHTML) + +        show = blob['bcw_show'] + +        # This is desired because any invalid show id redirects to `bandcamp.com` +        # which happens to expose the latest Bandcamp Weekly episode. 
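# Editor's aside (not part of the patch): the audio_stream loop further below
# uses Python's for/else idiom; the else branch runs only when the loop ends
# without break, i.e. when no known extension appears in the format_id.
# Standalone sketch with a made-up subset of KNOWN_EXTENSIONS:
def guess_ext(format_id, known_extensions=('mp3', 'opus', 'aac')):
    for known_ext in known_extensions:
        if known_ext in format_id:
            ext = known_ext
            break
    else:
        ext = None
    return ext

assert guess_ext('opus-lo') == 'opus'
assert guess_ext('mystery') is None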
+        show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + +        formats = [] +        for format_id, format_url in show['audio_stream'].items(): +            if not isinstance(format_url, compat_str): +                continue +            for known_ext in KNOWN_EXTENSIONS: +                if known_ext in format_id: +                    ext = known_ext +                    break +            else: +                ext = None +            formats.append({ +                'format_id': format_id, +                'url': format_url, +                'ext': ext, +                'vcodec': 'none', +            }) +        self._sort_formats(formats) + +        title = show.get('audio_title') or 'Bandcamp Weekly' +        subtitle = show.get('subtitle') +        if subtitle: +            title += ' - %s' % subtitle + +        episode_number = None +        seq = blob.get('bcw_seq') + +        if seq and isinstance(seq, list): +            try: +                episode_number = next( +                    int_or_none(e.get('episode_number')) +                    for e in seq +                    if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) +            except StopIteration: +                pass + +        return { +            'id': video_id, +            'title': title, +            'description': show.get('desc') or show.get('short_desc'), +            'duration': float_or_none(show.get('audio_duration')), +            'is_live': False, +            'release_date': unified_strdate(show.get('published_date')), +            'series': 'Bandcamp Weekly', +            'episode': show.get('subtitle'), +            'episode_number': episode_number, +            'episode_id': compat_str(video_id), +            'formats': formats +        } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index dd65b8d86..8b20c03d6 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -6,14 +6,18 @@ import itertools  from .common import InfoExtractor  from ..utils import ( +    clean_html,      dict_get,      ExtractorError,      float_or_none, +    get_element_by_class,      int_or_none,      parse_duration,      parse_iso8601,      try_get,      unescapeHTML, +    urlencode_postdata, +    urljoin,  )  from ..compat import (      compat_etree_fromstring, @@ -25,19 +29,23 @@ from ..compat import (  class BBCCoUkIE(InfoExtractor):      IE_NAME = 'bbc.co.uk'      IE_DESC = 'BBC iPlayer' -    _ID_REGEX = r'[pb][\da-z]{7}' +    _ID_REGEX = r'[pbw][\da-z]{7}'      _VALID_URL = r'''(?x)                      https?://                          (?:www\.)?bbc\.co\.uk/                          (?:                              programmes/(?!articles/)|                              iplayer(?:/[^/]+)?/(?:episode/|playlist/)| -                            music/clips[/#]| -                            radio/player/ +                            music/(?:clips|audiovideo/popular)[/#]| +                            radio/player/| +                            events/[^/]+/play/[^/]+/                          )                          (?P<id>%s)(?!/(?:episodes|broadcasts|clips))                      ''' % _ID_REGEX +    _LOGIN_URL = 'https://account.bbc.com/signin' +    _NETRC_MACHINE = 'bbc' +      _MEDIASELECTOR_URLS = [          # Provides HQ HLS streams with even better quality that pc mediaset but fails          # with geolocation in some cases when it's even not geo restricted at all (e.g. 
@@ -222,11 +230,49 @@ class BBCCoUkIE(InfoExtractor):          }, {              'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',              'only_matching': True, -        } -    ] +        }, { +            'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', +            'only_matching': True, +        }, { +            'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', +            'only_matching': True, +        }]      _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' +    def _login(self): +        username, password = self._get_login_info() +        if username is None: +            return + +        login_page = self._download_webpage( +            self._LOGIN_URL, None, 'Downloading signin page') + +        login_form = self._hidden_inputs(login_page) + +        login_form.update({ +            'username': username, +            'password': password, +        }) + +        post_url = urljoin(self._LOGIN_URL, self._search_regex( +            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, +            'post url', default=self._LOGIN_URL, group='url')) + +        response, urlh = self._download_webpage_handle( +            post_url, None, 'Logging in', data=urlencode_postdata(login_form), +            headers={'Referer': self._LOGIN_URL}) + +        if self._LOGIN_URL in urlh.geturl(): +            error = clean_html(get_element_by_class('form-message', response)) +            if error: +                raise ExtractorError( +                    'Unable to login: %s' % error, expected=True) +            raise ExtractorError('Unable to log in') + +    def _real_initialize(self): +        self._login() +      class MediaSelectionError(Exception):          def __init__(self, id):              self.id = id @@ -483,6 +529,12 @@ class BBCCoUkIE(InfoExtractor):          webpage = self._download_webpage(url, group_id, 'Downloading video page') +        error = self._search_regex( +            r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', +            webpage, 'error', default=None) +        if error: +            raise ExtractorError(error, expected=True) +          programme_id = None          duration = None diff --git a/youtube_dl/extractor/beampro.py b/youtube_dl/extractor/beampro.py index f3a9e3278..2eaec1ab4 100644 --- a/youtube_dl/extractor/beampro.py +++ b/youtube_dl/extractor/beampro.py @@ -6,18 +6,33 @@ from ..utils import (      ExtractorError,      clean_html,      compat_str, +    float_or_none,      int_or_none,      parse_iso8601,      try_get, +    urljoin,  ) -class BeamProLiveIE(InfoExtractor): -    IE_NAME = 'Beam:live' -    _VALID_URL = r'https?://(?:\w+\.)?beam\.pro/(?P<id>[^/?#&]+)' +class BeamProBaseIE(InfoExtractor): +    _API_BASE = 'https://mixer.com/api/v1'      _RATINGS = {'family': 0, 'teen': 13, '18+': 18} + +    def _extract_channel_info(self, chan): +        user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) +        return { +            'uploader': chan.get('token') or try_get( +                chan, lambda x: x['user']['username'], compat_str), +            'uploader_id': compat_str(user_id) if user_id else None, +            'age_limit': self._RATINGS.get(chan.get('audience')), +        } + + +class BeamProLiveIE(BeamProBaseIE): +    IE_NAME = 'Mixer:live' +    _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P<id>[^/?#&]+)'      _TEST = { -        'url': 'http://www.beam.pro/niterhayven', +        'url': 'http://mixer.com/niterhayven',          'info_dict': {          
    'id': '261562',              'ext': 'mp4', @@ -38,11 +53,17 @@ class BeamProLiveIE(InfoExtractor):          },      } +    _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE + +    @classmethod +    def suitable(cls, url): +        return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url) +      def _real_extract(self, url):          channel_name = self._match_id(url)          chan = self._download_json( -            'https://beam.pro/api/v1/channels/%s' % channel_name, channel_name) +            '%s/channels/%s' % (self._API_BASE, channel_name), channel_name)          if chan.get('online') is False:              raise ExtractorError( @@ -50,24 +71,118 @@ class BeamProLiveIE(InfoExtractor):          channel_id = chan['id'] +        def manifest_url(kind): +            return self._MANIFEST_URL_TEMPLATE % (channel_id, kind) +          formats = self._extract_m3u8_formats( -            'https://beam.pro/api/v1/channels/%s/manifest.m3u8' % channel_id, -            channel_name, ext='mp4', m3u8_id='hls', fatal=False) +            manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls', +            fatal=False) +        formats.extend(self._extract_smil_formats( +            manifest_url('smil'), channel_name, fatal=False))          self._sort_formats(formats) -        user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) - -        return { +        info = {              'id': compat_str(chan.get('id') or channel_name),              'title': self._live_title(chan.get('name') or channel_name),              'description': clean_html(chan.get('description')), -            'thumbnail': try_get(chan, lambda x: x['thumbnail']['url'], compat_str), +            'thumbnail': try_get( +                chan, lambda x: x['thumbnail']['url'], compat_str),              'timestamp': parse_iso8601(chan.get('updatedAt')), -            'uploader': chan.get('token') or try_get( -                chan, lambda x: x['user']['username'], compat_str), -            'uploader_id': compat_str(user_id) if user_id else None, -            'age_limit': self._RATINGS.get(chan.get('audience')),              'is_live': True,              'view_count': int_or_none(chan.get('viewersTotal')),              'formats': formats,          } +        info.update(self._extract_channel_info(chan)) + +        return info + + +class BeamProVodIE(BeamProBaseIE): +    IE_NAME = 'Mixer:vod' +    _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>\d+)' +    _TEST = { +        'url': 'https://mixer.com/willow8714?vod=2259830', +        'md5': 'b2431e6e8347dc92ebafb565d368b76b', +        'info_dict': { +            'id': '2259830', +            'ext': 'mp4', +            'title': 'willow8714\'s Channel', +            'duration': 6828.15, +            'thumbnail': r're:https://.*source\.png$', +            'timestamp': 1494046474, +            'upload_date': '20170506', +            'uploader': 'willow8714', +            'uploader_id': '6085379', +            'age_limit': 13, +            'view_count': int, +        }, +        'params': { +            'skip_download': True, +        }, +    } + +    @staticmethod +    def _extract_format(vod, vod_type): +        if not vod.get('baseUrl'): +            return [] + +        if vod_type == 'hls': +            filename, protocol = 'manifest.m3u8', 'm3u8_native' +        elif vod_type == 'raw': +            filename, protocol = 'source.mp4', 'https' +        else: +            
assert False + +        data = vod.get('data') if isinstance(vod.get('data'), dict) else {} + +        format_id = [vod_type] +        if isinstance(data.get('Height'), compat_str): +            format_id.append('%sp' % data['Height']) + +        return [{ +            'url': urljoin(vod['baseUrl'], filename), +            'format_id': '-'.join(format_id), +            'ext': 'mp4', +            'protocol': protocol, +            'width': int_or_none(data.get('Width')), +            'height': int_or_none(data.get('Height')), +            'fps': int_or_none(data.get('Fps')), +            'tbr': int_or_none(data.get('Bitrate'), 1000), +        }] + +    def _real_extract(self, url): +        vod_id = self._match_id(url) + +        vod_info = self._download_json( +            '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id) + +        state = vod_info.get('state') +        if state != 'AVAILABLE': +            raise ExtractorError( +                'VOD %s is not available (state: %s)' % (vod_id, state), +                expected=True) + +        formats = [] +        thumbnail_url = None + +        for vod in vod_info['vods']: +            vod_type = vod.get('format') +            if vod_type in ('hls', 'raw'): +                formats.extend(self._extract_format(vod, vod_type)) +            elif vod_type == 'thumbnail': +                thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png') + +        self._sort_formats(formats) + +        info = { +            'id': vod_id, +            'title': vod_info.get('name') or vod_id, +            'duration': float_or_none(vod_info.get('duration')), +            'thumbnail': thumbnail_url, +            'timestamp': parse_iso8601(vod_info.get('createdAt')), +            'view_count': int_or_none(vod_info.get('viewsTotal')), +            'formats': formats, +        } +        info.update(self._extract_channel_info(vod_info.get('channel') or {})) + +        return info diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 80dd8382e..1e57310d6 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -54,6 +54,22 @@ class BiliBiliIE(InfoExtractor):              'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',          },          'skip': 'Geo-restricted to China', +    }, { +        # Title with double quotes +        'url': 'http://www.bilibili.com/video/av8903802/', +        'info_dict': { +            'id': '8903802', +            'ext': 'mp4', +            'title': '阿滴英文|英文歌分享#6 "Closer', +            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 
微博@阿滴英文', +            'uploader': '阿滴英文', +            'uploader_id': '65880958', +            'timestamp': 1488382620, +            'upload_date': '20170301', +        }, +        'params': { +            'skip_download': True,  # Test metadata only +        },      }]      _APP_KEY = '84956560bc028eb7' @@ -122,6 +138,11 @@ class BiliBiliIE(InfoExtractor):                      'preference': -2 if 'hd.mp4' in backup_url else -3,                  }) +            for a_format in formats: +                a_format.setdefault('http_headers', {}).update({ +                    'Referer': url, +                }) +              self._sort_formats(formats)              entries.append({ @@ -130,7 +151,7 @@ class BiliBiliIE(InfoExtractor):                  'formats': formats,              }) -        title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') +        title = self._html_search_regex('<h1[^>]*>([^<]+)</h1>', webpage, 'title')          description = self._html_search_meta('description', webpage)          timestamp = unified_timestamp(self._html_search_regex(              r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', default=None)) diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 9661ade4f..07833532e 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -33,13 +33,18 @@ class BpbIE(InfoExtractor):          title = self._html_search_regex(              r'<h2 class="white">(.*?)</h2>', webpage, 'title')          video_info_dicts = re.findall( -            r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) +            r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage)          formats = []          for video_info in video_info_dicts: -            video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) -            quality = video_info['quality'] -            video_url = video_info['src'] +            video_info = self._parse_json( +                video_info, video_id, transform_source=js_to_json, fatal=False) +            if not video_info: +                continue +            video_url = video_info.get('src') +            if not video_url: +                continue +            quality = 'high' if '_high' in video_url else 'low'              formats.append({                  'url': video_url,                  'preference': 10 if quality == 'high' else 0, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 3f017a2b1..0ed59bcbc 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -5,6 +5,7 @@ import re  import json  from .common import InfoExtractor +from .adobepass import AdobePassIE  from ..compat import (      compat_etree_fromstring,      compat_parse_qs, @@ -448,7 +449,7 @@ class BrightcoveLegacyIE(InfoExtractor):          return info -class BrightcoveNewIE(InfoExtractor): +class BrightcoveNewIE(AdobePassIE):      IE_NAME = 'brightcove:new'      _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)'      _TESTS = [{ @@ -602,6 +603,20 @@ class BrightcoveNewIE(InfoExtractor):                  raise ExtractorError(message, expected=True)              raise +        errors = json_data.get('errors') +        if errors and errors[0].get('error_subcode') == 'TVE_AUTH': +            custom_fields = json_data['custom_fields'] +            tve_token = self._extract_mvpd_auth( +                
smuggled_data['source_url'], video_id, +                custom_fields['bcadobepassrequestorid'], +                custom_fields['bcadobepassresourceid']) +            json_data = self._download_json( +                api_url, video_id, headers={ +                    'Accept': 'application/json;pk=%s' % policy_key +                }, query={ +                    'tveToken': tve_token, +                }) +          title = json_data['name'].strip()          formats = [] @@ -667,7 +682,6 @@ class BrightcoveNewIE(InfoExtractor):                      })                  formats.append(f) -        errors = json_data.get('errors')          if not formats and errors:              error = errors[0]              raise ExtractorError( @@ -684,7 +698,7 @@ class BrightcoveNewIE(InfoExtractor):          is_live = False          duration = float_or_none(json_data.get('duration'), 1000) -        if duration and duration < 0: +        if duration is not None and duration <= 0:              is_live = True          return { diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py index 75fa92d7c..ec411091e 100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -84,9 +84,10 @@ class BuzzFeedIE(InfoExtractor):                  continue              entries.append(self.url_result(video['url'])) -        facebook_url = FacebookIE._extract_url(webpage) -        if facebook_url: -            entries.append(self.url_result(facebook_url)) +        facebook_urls = FacebookIE._extract_urls(webpage) +        entries.extend([ +            self.url_result(facebook_url) +            for facebook_url in facebook_urls])          return {              '_type': 'playlist', diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 87ad14e91..9faf40227 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -200,6 +200,7 @@ class CBCWatchBaseIE(InfoExtractor):          'media': 'http://search.yahoo.com/mrss/',          'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/',      } +    _GEO_COUNTRIES = ['CA']      def _call_api(self, path, video_id):          url = path if path.startswith('http') else self._API_BASE_URL + path @@ -287,6 +288,11 @@ class CBCWatchBaseIE(InfoExtractor):  class CBCWatchVideoIE(CBCWatchBaseIE):      IE_NAME = 'cbc.ca:watch:video'      _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' +    _TEST = { +        # geo-restricted to Canada, bypassable +        'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235', +        'only_matching': True, +    }      def _real_extract(self, url):          video_id = self._match_id(url) @@ -323,9 +329,10 @@ class CBCWatchIE(CBCWatchBaseIE):      IE_NAME = 'cbc.ca:watch'      _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'      _TESTS = [{ +        # geo-restricted to Canada, bypassable          'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4',          'info_dict': { -            'id': '38e815a-009e3ab12e4', +            'id': '9673749a-5e77-484c-8b62-a1092a6b5168',              'ext': 'mp4',              'title': 'Customer (Dis)Service',              'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', @@ -337,8 +344,8 @@ class CBCWatchIE(CBCWatchBaseIE):              
'skip_download': True,              'format': 'bestvideo',          }, -        'skip': 'Geo-restricted to Canada',      }, { +        # geo-restricted to Canada, bypassable          'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057',          'info_dict': {              'id': '1ed4b385-cd84-49cf-95f0-80f004680057', @@ -346,7 +353,6 @@ class CBCWatchIE(CBCWatchBaseIE):              'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.',          },          'playlist_mincount': 30, -        'skip': 'Geo-restricted to Canada',      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 58f258c54..1268e38ef 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -49,13 +49,13 @@ class CBSIE(CBSBaseIE):          'only_matching': True,      }] -    def _extract_video_info(self, content_id): +    def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517):          items_data = self._download_xml(              'http://can.cbs.com/thunder/player/videoPlayerService.php', -            content_id, query={'partner': 'cbs', 'contentId': content_id}) +            content_id, query={'partner': site, 'contentId': content_id})          video_data = xpath_element(items_data, './/item')          title = xpath_text(video_data, 'videoTitle', 'title', True) -        tp_path = 'dJ5BDC/media/guid/2198311517/%s' % content_id +        tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id)          tp_release_url = 'http://link.theplatform.com/s/' + tp_path          asset_types = [] diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 57b18e81d..681d63e29 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -3,17 +3,18 @@ from __future__ import unicode_literals  import re -from .theplatform import ThePlatformIE +from .cbs import CBSIE  from ..utils import int_or_none -class CBSInteractiveIE(ThePlatformIE): -    _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video/share)/(?P<id>[^/?]+)' +class CBSInteractiveIE(CBSIE): +    _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)'      _TESTS = [{          'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',          'info_dict': { -            'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', -            'ext': 'flv', +            'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00', +            'display_id': 'hands-on-with-microsofts-windows-8-1-update', +            'ext': 'mp4',              'title': 'Hands-on with Microsoft Windows 8.1 Update',              'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',              'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', @@ -22,13 +23,19 @@ class CBSInteractiveIE(ThePlatformIE):              'timestamp': 1396479627,              'upload_date': '20140402',          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }, {          'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', +        'md5': 'f11d27b2fa18597fbf92444d2a9ed386',          'info_dict': { -            'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', -            'ext': 'flv', +            'id': 
'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK', +            'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187', +            'ext': 'mp4',              'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', -            'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole', +            'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f',              'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',              'uploader': 'Ashley Esqueda',              'duration': 1482, @@ -38,23 +45,28 @@ class CBSInteractiveIE(ThePlatformIE):      }, {          'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/',          'info_dict': { -            'id': 'bc1af9f0-a2b5-4e54-880d-0d95525781c0', +            'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt', +            'display_id': 'video-keeping-android-smartphones-and-tablets-secure',              'ext': 'mp4',              'title': 'Video: Keeping Android smartphones and tablets secure',              'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.',              'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0',              'uploader': 'Adrian Kingsley-Hughes', -            'timestamp': 1448961720, -            'upload_date': '20151201', +            'duration': 731, +            'timestamp': 1449129925, +            'upload_date': '20151203',          },          'params': {              # m3u8 download              'skip_download': True, -        } +        }, +    }, { +        'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/', +        'only_matching': True,      }] -    TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' +      MPX_ACCOUNTS = { -        'cnet': 2288573011, +        'cnet': 2198311517,          'zdnet': 2387448114,      } @@ -68,7 +80,8 @@ class CBSInteractiveIE(ThePlatformIE):          data = self._parse_json(data_json, display_id)          vdata = data.get('video') or data['videos'][0] -        video_id = vdata['id'] +        video_id = vdata['mpxRefId'] +          title = vdata['title']          author = vdata.get('author')          if author: @@ -78,20 +91,7 @@ class CBSInteractiveIE(ThePlatformIE):              uploader = None              uploader_id = None -        media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId']) -        formats, subtitles = [], {} -        for (fkey, vid) in vdata['files'].items(): -            if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: -                continue -            release_url = self.TP_RELEASE_URL_TEMPLATE % vid -            if fkey == 'hds': -                release_url += '&manifest=f4m' -            tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) -            formats.extend(tp_formats) -            subtitles = self._merge_subtitles(subtitles, tp_subtitles) -        self._sort_formats(formats) - -        info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id) +        info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site])          info.update({              'id': video_id,              'display_id': display_id, @@ -99,7 +99,5 @@ 
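The CBSInteractive changes in this hunk drop the open-coded ThePlatform SMIL loop in favour of the shared CBSIE._extract_video_info(content_id, site, mpx_acc), so the only site-specific input left is the MPX account number. A minimal sketch of how that number ends up in the ThePlatform release URL, using only values visible in the diff (build_tp_release_url is a hypothetical helper for illustration, not youtube-dl API):

    # Sketch only, under the assumptions stated above.
    MPX_ACCOUNTS = {
        'cnet': 2198311517,
        'zdnet': 2387448114,
    }

    def build_tp_release_url(site, content_id):
        # Same guid-path pattern as CBSIE._extract_video_info above.
        tp_path = 'dJ5BDC/media/guid/%d/%s' % (MPX_ACCOUNTS[site], content_id)
        return 'http://link.theplatform.com/s/' + tp_path

    print(build_tp_release_url('cnet', 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00'))
    # http://link.theplatform.com/s/dJ5BDC/media/guid/2198311517/R49SYt__yAfmlXR85z4f7gNmCBDcN_00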
class CBSInteractiveIE(ThePlatformIE):              'duration': int_or_none(vdata.get('duration')),              'uploader': uploader,              'uploader_id': uploader_id, -            'subtitles': subtitles, -            'formats': formats,          })          return info diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 17bb9af4f..51df15fac 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -15,19 +15,23 @@ class CBSNewsIE(CBSIE):      _TESTS = [          { -            'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/', +            # 60 minutes +            'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/',              'info_dict': { -                'id': 'tesla-and-spacex-elon-musks-industrial-empire', -                'ext': 'flv', -                'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire', -                'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg', -                'duration': 791, +                'id': '_B6Ga3VJrI4iQNKsir_cdFo9Re_YJHE_', +                'ext': 'mp4', +                'title': 'Artificial Intelligence', +                'description': 'md5:8818145f9974431e0fb58a1b8d69613c', +                'thumbnail': r're:^https?://.*\.jpg$', +                'duration': 1606, +                'uploader': 'CBSI-NEW', +                'timestamp': 1498431900, +                'upload_date': '20170625',              },              'params': { -                # rtmp download +                # m3u8 download                  'skip_download': True,              }, -            'skip': 'Subscribers only',          },          {              'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', @@ -52,6 +56,22 @@ class CBSNewsIE(CBSIE):                  'skip_download': True,              },          }, +        { +            # 48 hours +            'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', +            'info_dict': { +                'id': 'QpM5BJjBVEAUFi7ydR9LusS69DPLqPJ1', +                'ext': 'mp4', +                'title': 'Cold as Ice', +                'description': 'Can a childhood memory of a friend\'s murder solve a 1957 cold case? 
"48 Hours" correspondent Erin Moriarty has the latest.', +                'upload_date': '20170604', +                'timestamp': 1496538000, +                'uploader': 'CBSI-NEW', +            }, +            'params': { +                'skip_download': True, +            }, +        },      ]      def _real_extract(self, url): @@ -60,12 +80,18 @@ class CBSNewsIE(CBSIE):          webpage = self._download_webpage(url, video_id)          video_info = self._parse_json(self._html_search_regex( -            r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'', -            webpage, 'video JSON info'), video_id) +            r'(?:<ul class="media-list items" id="media-related-items"[^>]*><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'', +            webpage, 'video JSON info', default='{}'), video_id, fatal=False) + +        if video_info: +            item = video_info['item'] if 'item' in video_info else video_info +        else: +            state = self._parse_json(self._search_regex( +                r'data-cbsvideoui-options=(["\'])(?P<json>{.+?})\1', webpage, +                'playlist JSON info', group='json'), video_id)['state'] +            item = state['playlist'][state['pid']] -        item = video_info['item'] if 'item' in video_info else video_info -        guid = item['mpxRefId'] -        return self._extract_video_info(guid) +        return self._extract_video_info(item['mpxRefId'], 'cbsnews')  class CBSNewsLiveVideoIE(InfoExtractor): diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 78b7a923c..0c3af23d5 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -124,7 +124,7 @@ class CDAIE(InfoExtractor):          }          def extract_format(page, version): -            json_str = self._search_regex( +            json_str = self._html_search_regex(                  r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,                  '%s player_json' % version, fatal=False, group='player_data')              if not json_str: diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py index 2d517f231..42c9af263 100644 --- a/youtube_dl/extractor/charlierose.py +++ b/youtube_dl/extractor/charlierose.py @@ -5,7 +5,7 @@ from ..utils import remove_end  class CharlieRoseIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P<id>\d+)'      _TESTS = [{          'url': 'https://charlierose.com/videos/27996',          'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', @@ -24,6 +24,9 @@ class CharlieRoseIE(InfoExtractor):      }, {          'url': 'https://charlierose.com/videos/27996',          'only_matching': True, +    }, { +        'url': 'https://charlierose.com/episodes/30887?autoplay=true', +        'only_matching': True,      }]      _PLAYER_BASE = 'https://charlierose.com/video/player/%s' diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index 0206d96db..d4769da75 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -5,6 +5,7 @@ import base64  import json  from .common import InfoExtractor +from .youtube import YoutubeIE  from ..utils import (      clean_html,      ExtractorError @@ -70,11 +71,9 @@ class ChilloutzoneIE(InfoExtractor):          # If 
nativePlatform is None a fallback mechanism is used (i.e. youtube embed)          if native_platform is None: -            youtube_url = self._html_search_regex( -                r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', -                webpage, 'fallback video URL', default=None) -            if youtube_url is not None: -                return self.url_result(youtube_url, ie='Youtube') +            youtube_url = YoutubeIE._extract_url(webpage) +            if youtube_url: +                return self.url_result(youtube_url, ie=YoutubeIE.ie_key())          # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or          # the own CDN diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py index 562c9bbbb..b861d54b0 100644 --- a/youtube_dl/extractor/cinchcast.py +++ b/youtube_dl/extractor/cinchcast.py @@ -9,12 +9,20 @@ from ..utils import (  class CinchcastIE(InfoExtractor): -    _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)' -    _TEST = { +    _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)' +    _TESTS = [{ +        'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', +        'info_dict': { +            'id': '5258197', +            'ext': 'mp3', +            'title': 'Train Your Brain to Up Your Game with Coach Mandy', +            'upload_date': '20130816', +        }, +    }, {          # Actual test is run in generic, look for undergroundwellness          'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703',          'only_matching': True, -    } +    }]      def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py new file mode 100644 index 000000000..505bdbe16 --- /dev/null +++ b/youtube_dl/extractor/cjsw.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    unescapeHTML, +) + + +class CJSWIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)' +    _TESTS = [{ +        'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620', +        'md5': 'cee14d40f1e9433632c56e3d14977120', +        'info_dict': { +            'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41', +            'ext': 'mp3', +            'title': 'Freshly Squeezed – Episode June 20, 2017', +            'description': 'md5:c967d63366c3898a80d0c7b0ff337202', +            'series': 'Freshly Squeezed', +            'episode_id': '20170620', +        }, +    }, { +        # no description +        'url': 'http://cjsw.com/program/road-pops/episode/20170707/', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        program, episode_id = mobj.group('program', 'id') +        audio_id = '%s/%s' % (program, episode_id) + +        webpage = self._download_webpage(url, episode_id) + +        title = unescapeHTML(self._search_regex( +            (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)', +             r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'), +            webpage, 'title', group='title')) + +        audio_url = self._search_regex( +            r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1', +            webpage, 'audio url', group='url') + 
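The CJSW extractor above leans repeatedly on the quote-agnostic attribute pattern (["\'])(?P<name>(?:(?!\1).)+)\1: the opening quote is captured, the tempered dot consumes anything that is not that same quote, and the backreference closes the match regardless of which quote style the page used. A minimal standalone sketch of the idiom (the sample markup is invented for illustration):

    import re

    # Invented markup; the pattern mirrors the data-audio-src regex above.
    html = "<button data-audio-src='https://example.com/episode.mp3'>Play</button>"

    mobj = re.search(
        r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1', html)
    if mobj:
        print(mobj.group('url'))  # https://example.com/episode.mp3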
+        audio_id = self._search_regex( +            r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3', +            audio_url, 'audio id', default=audio_id) + +        formats = [{ +            'url': audio_url, +            'ext': determine_ext(audio_url, 'mp3'), +            'vcodec': 'none', +        }] + +        description = self._html_search_regex( +            r'<p>(?P<description>.+?)</p>', webpage, 'description', +            default=None) +        series = self._search_regex( +            r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage, +            'series', default=program, group='name') + +        return { +            'id': audio_id, +            'title': title, +            'description': description, +            'formats': formats, +            'series': series, +            'episode_id': episode_id, +        } diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py deleted file mode 100644 index 0920f6219..000000000 --- a/youtube_dl/extractor/clipfish.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( -    int_or_none, -    unified_strdate, -) - - -class ClipfishIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)' -    _TEST = { -        'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/', -        'md5': 'b9a5dc46294154c1193e2d10e0c95693', -        'info_dict': { -            'id': '4343170', -            'ext': 'mp4', -            'title': 'S01 E01 - Ugly Americans - Date in der Hölle', -            'description': 'Mark Lilly arbeitet im Sozialdienst der Stadt New York und soll Immigranten bei ihrer Einbürgerung in die USA zur Seite stehen.', -            'upload_date': '20161005', -            'duration': 1291, -            'view_count': int, -        } -    } - -    def _real_extract(self, url): -        video_id = self._match_id(url) - -        video_info = self._download_json( -            'http://www.clipfish.de/devapi/id/%s?format=json&apikey=hbbtv' % video_id, -            video_id)['items'][0] - -        formats = [] - -        m3u8_url = video_info.get('media_videourl_hls') -        if m3u8_url: -            formats.append({ -                'url': m3u8_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), -                'ext': 'mp4', -                'format_id': 'hls', -            }) - -        mp4_url = video_info.get('media_videourl') -        if mp4_url: -            formats.append({ -                'url': mp4_url, -                'format_id': 'mp4', -                'width': int_or_none(video_info.get('width')), -                'height': int_or_none(video_info.get('height')), -                'tbr': int_or_none(video_info.get('bitrate')), -            }) - -        descr = video_info.get('descr') -        if descr: -            descr = descr.strip() - -        return { -            'id': video_id, -            'title': video_info['title'], -            'description': descr, -            'formats': formats, -            'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'), -            'duration': int_or_none(video_info.get('media_length')), -            'upload_date': unified_strdate(video_info.get('pubDate')), -            'view_count': int_or_none(video_info.get('media_views')) -        } diff --git a/youtube_dl/extractor/clippit.py 
b/youtube_dl/extractor/clippit.py new file mode 100644 index 000000000..a1a7a774c --- /dev/null +++ b/youtube_dl/extractor/clippit.py @@ -0,0 +1,74 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    parse_iso8601, +    qualities, +) + +import re + + +class ClippitIE(InfoExtractor): + +    _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)' +    _TEST = { +        'url': 'https://www.clippituser.tv/c/evmgm', +        'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09', +        'info_dict': { +            'id': 'evmgm', +            'ext': 'mp4', +            'title': 'Bye bye Brutus. #BattleBots  - Clippit', +            'uploader': 'lizllove', +            'uploader_url': 'https://www.clippituser.tv/p/lizllove', +            'timestamp': 1472183818, +            'upload_date': '20160826', +            'description': 'BattleBots | ABC', +            'thumbnail': r're:^https?://.*\.jpg$', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        title = self._html_search_regex(r'<title.*>(.+?)</title>', webpage, 'title') + +        FORMATS = ('sd', 'hd') +        quality = qualities(FORMATS) +        formats = [] +        for format_id in FORMATS: +            url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id, +                                          webpage, 'url', fatal=False) +            if not url: +                continue +            match = re.search(r'/(?P<height>\d+)\.mp4', url) +            formats.append({ +                'url': url, +                'format_id': format_id, +                'quality': quality(format_id), +                'height': int(match.group('height')) if match else None, +            }) + +        uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n', +                                           webpage, 'uploader', fatal=False) +        uploader_url = ('https://www.clippituser.tv/p/' + uploader +                        if uploader else None) + +        timestamp = self._html_search_regex(r'datetime="(.+?)"', +                                            webpage, 'date', fatal=False) +        thumbnail = self._html_search_regex(r'data-image="(.+?)"', +                                            webpage, 'thumbnail', fatal=False) + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +            'uploader': uploader, +            'uploader_url': uploader_url, +            'timestamp': parse_iso8601(timestamp), +            'description': self._og_search_description(webpage), +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 9bc8dbea4..85ca20ecc 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -30,7 +30,11 @@ class CloudyIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage( -            'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id) +            'https://www.cloudy.ec/embed.php', video_id, query={ +                'id': video_id, +                'playerPage': 1, +                'autoplay': 1, +            })          info = self._parse_html5_media_entries(url, webpage, video_id)[0] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 76b5378e9..317a9a76f 100644 --- 
a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -27,6 +27,7 @@ from ..compat import (      compat_urllib_parse_urlencode,      compat_urllib_request,      compat_urlparse, +    compat_xml_parse_error,  )  from ..downloader.f4m import remove_encrypted_media  from ..utils import ( @@ -376,7 +377,7 @@ class InfoExtractor(object):              cls._VALID_URL_RE = re.compile(cls._VALID_URL)          m = cls._VALID_URL_RE.match(url)          assert m -        return m.group('id') +        return compat_str(m.group('id'))      @classmethod      def working(cls): @@ -420,7 +421,7 @@ class InfoExtractor(object):              if country_code:                  self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)                  if self._downloader.params.get('verbose', False): -                    self._downloader.to_stdout( +                    self._downloader.to_screen(                          '[debug] Using fake IP %s (%s) as X-Forwarded-For.'                          % (self._x_forwarded_for_ip, country_code.upper())) @@ -646,15 +647,29 @@ class InfoExtractor(object):      def _download_xml(self, url_or_request, video_id,                        note='Downloading XML', errnote='Unable to download XML', -                      transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): +                      transform_source=None, fatal=True, encoding=None, +                      data=None, headers={}, query={}):          """Return the xml as an xml.etree.ElementTree.Element"""          xml_string = self._download_webpage( -            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) +            url_or_request, video_id, note, errnote, fatal=fatal, +            encoding=encoding, data=data, headers=headers, query=query)          if xml_string is False:              return xml_string +        return self._parse_xml( +            xml_string, video_id, transform_source=transform_source, +            fatal=fatal) + +    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):          if transform_source:              xml_string = transform_source(xml_string) -        return compat_etree_fromstring(xml_string.encode('utf-8')) +        try: +            return compat_etree_fromstring(xml_string.encode('utf-8')) +        except compat_xml_parse_error as ve: +            errmsg = '%s: Failed to parse XML ' % video_id +            if fatal: +                raise ExtractorError(errmsg, cause=ve) +            else: +                self.report_warning(errmsg + str(ve))      def _download_json(self, url_or_request, video_id,                         note='Downloading JSON metadata', @@ -730,12 +745,12 @@ class InfoExtractor(object):              video_info['title'] = video_title          return video_info -    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None): -        urlrs = orderedSet( +    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): +        urls = orderedSet(              self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)              for m in matches)          return self.playlist_result( -            urlrs, playlist_id=video_id, playlist_title=video_title) +            urls, playlist_id=playlist_id, playlist_title=playlist_title)      @staticmethod      def playlist_result(entries, playlist_id=None, playlist_title=None, 
playlist_description=None): @@ -940,7 +955,8 @@ class InfoExtractor(object):      def _family_friendly_search(self, html):          # See http://schema.org/VideoObject -        family_friendly = self._html_search_meta('isFamilyFriendly', html) +        family_friendly = self._html_search_meta( +            'isFamilyFriendly', html, default=None)          if not family_friendly:              return None @@ -1002,17 +1018,17 @@ class InfoExtractor(object):                  item_type = e.get('@type')                  if expected_type is not None and expected_type != item_type:                      return info -                if item_type == 'TVEpisode': +                if item_type in ('TVEpisode', 'Episode'):                      info.update({                          'episode': unescapeHTML(e.get('name')),                          'episode_number': int_or_none(e.get('episodeNumber')),                          'description': unescapeHTML(e.get('description')),                      })                      part_of_season = e.get('partOfSeason') -                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': +                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):                          info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))                      part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') -                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': +                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):                          info['series'] = unescapeHTML(part_of_series.get('name'))                  elif item_type == 'Article':                      info.update({ @@ -1022,10 +1038,10 @@ class InfoExtractor(object):                      })                  elif item_type == 'VideoObject':                      extract_video_object(e) -                elif item_type == 'WebPage': -                    video = e.get('video') -                    if isinstance(video, dict) and video.get('@type') == 'VideoObject': -                        extract_video_object(video) +                    continue +                video = e.get('video') +                if isinstance(video, dict) and video.get('@type') == 'VideoObject': +                    extract_video_object(video)                  break          return dict((k, v) for k, v in info.items() if v is not None) @@ -1785,7 +1801,7 @@ class InfoExtractor(object):                      ms_info['timescale'] = int(timescale)                  segment_duration = source.get('duration')                  if segment_duration: -                    ms_info['segment_duration'] = int(segment_duration) +                    ms_info['segment_duration'] = float(segment_duration)              def extract_Initialization(source):                  initialization = source.find(_add_ns('Initialization')) @@ -1892,9 +1908,13 @@ class InfoExtractor(object):                                  'Bandwidth': bandwidth,                              } +                        def location_key(location): +                            return 'url' if re.match(r'^https?://', location) else 'path' +                          if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:                              media_template = prepare_template('media', ('Number', 
'Bandwidth', 'Time')) +                            media_location_key = location_key(media_template)                              # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$                              # can't be used at the same time @@ -1904,7 +1924,7 @@ class InfoExtractor(object):                                      segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])                                      representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))                                  representation_ms_info['fragments'] = [{ -                                    'url': media_template % { +                                    media_location_key: media_template % {                                          'Number': segment_number,                                          'Bandwidth': bandwidth,                                      }, @@ -1928,7 +1948,7 @@ class InfoExtractor(object):                                          'Number': segment_number,                                      }                                      representation_ms_info['fragments'].append({ -                                        'url': segment_url, +                                        media_location_key: segment_url,                                          'duration': float_or_none(segment_d, representation_ms_info['timescale']),                                      }) @@ -1952,8 +1972,9 @@ class InfoExtractor(object):                              for s in representation_ms_info['s']:                                  duration = float_or_none(s['d'], timescale)                                  for r in range(s.get('r', 0) + 1): +                                    segment_uri = representation_ms_info['segment_urls'][segment_index]                                      fragments.append({ -                                        'url': representation_ms_info['segment_urls'][segment_index], +                                        location_key(segment_uri): segment_uri,                                          'duration': duration,                                      })                                      segment_index += 1 @@ -1962,6 +1983,7 @@ class InfoExtractor(object):                          # No fragments key is present in this case.                          
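The MPD changes in this region stop joining every fragment URL against the base URL up front: relative locations are now stored under a 'path' key next to a single format-level 'fragment_base_url', absolute locations keep a 'url' key, and the new location_key() helper decides which. A minimal sketch of how such entries would be resolved later, with resolve_fragment() as a hypothetical stand-in for the downloader side:

    import re
    from youtube_dl.utils import urljoin

    def location_key(location):
        # Mirrors the helper introduced in the diff.
        return 'url' if re.match(r'^https?://', location) else 'path'

    def resolve_fragment(f, fragment):
        # Hypothetical: absolute fragments win, relative ones are joined
        # against the format-level fragment_base_url at download time.
        return fragment.get('url') or urljoin(
            f['fragment_base_url'], fragment['path'])

    f = {
        'fragment_base_url': 'https://cdn.example.com/dash/',
        'fragments': [{location_key(u): u}
                      for u in ('seg-1.m4s', 'https://mirror.example.com/seg-2.m4s')],
    }
    print(resolve_fragment(f, f['fragments'][0]))  # https://cdn.example.com/dash/seg-1.m4s
    print(resolve_fragment(f, f['fragments'][1]))  # https://mirror.example.com/seg-2.m4s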
if 'fragments' in representation_ms_info:                              f.update({ +                                'fragment_base_url': base_url,                                  'fragments': [],                                  'protocol': 'http_dash_segments',                              }) @@ -1969,10 +1991,8 @@ class InfoExtractor(object):                                  initialization_url = representation_ms_info['initialization_url']                                  if not f.get('url'):                                      f['url'] = initialization_url -                                f['fragments'].append({'url': initialization_url}) +                                f['fragments'].append({location_key(initialization_url): initialization_url})                              f['fragments'].extend(representation_ms_info['fragments']) -                            for fragment in f['fragments']: -                                fragment['url'] = urljoin(base_url, fragment['url'])                          try:                              existing_format = next(                                  fo for fo in formats @@ -2001,6 +2021,12 @@ class InfoExtractor(object):              compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)      def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): +        """ +        Parse formats from ISM manifest. +        References: +         1. [MS-SSTR]: Smooth Streaming Protocol, +            https://msdn.microsoft.com/en-us/library/ff469518.aspx +        """          if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:              return [] @@ -2022,8 +2048,11 @@ class InfoExtractor(object):                      self.report_warning('%s is not a supported codec' % fourcc)                      continue                  tbr = int(track.attrib['Bitrate']) // 1000 -                width = int_or_none(track.get('MaxWidth')) -                height = int_or_none(track.get('MaxHeight')) +                # [1] does not mention Width and Height attributes. 
However, +                # they're often present while MaxWidth and MaxHeight are +                # missing, so should be used as fallbacks +                width = int_or_none(track.get('MaxWidth') or track.get('Width')) +                height = int_or_none(track.get('MaxHeight') or track.get('Height'))                  sampling_rate = int_or_none(track.get('SamplingRate'))                  track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) @@ -2101,19 +2130,19 @@ class InfoExtractor(object):                  return f              return {} -        def _media_formats(src, cur_media_type): +        def _media_formats(src, cur_media_type, type_info={}):              full_url = absolute_url(src) -            ext = determine_ext(full_url) +            ext = type_info.get('ext') or determine_ext(full_url)              if ext == 'm3u8':                  is_plain_url = False                  formats = self._extract_m3u8_formats(                      full_url, video_id, ext='mp4',                      entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, -                    preference=preference) +                    preference=preference, fatal=False)              elif ext == 'mpd':                  is_plain_url = False                  formats = self._extract_mpd_formats( -                    full_url, video_id, mpd_id=mpd_id) +                    full_url, video_id, mpd_id=mpd_id, fatal=False)              else:                  is_plain_url = True                  formats = [{ @@ -2123,15 +2152,18 @@ class InfoExtractor(object):              return is_plain_url, formats          entries = [] +        # amp-video and amp-audio are very similar to their HTML5 counterparts +        # so we wll include them right here (see +        # https://www.ampproject.org/docs/reference/components/amp-video)          media_tags = [(media_tag, media_type, '')                        for media_tag, media_type -                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] +                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]          media_tags.extend(re.findall(              # We only allow video|audio followed by a whitespace or '>'.              # Allowing more characters may end up in significant slow down (see              # https://github.com/rg3/youtube-dl/issues/11979, example URL:              # http://www.porntrex.com/maps/videositemap.xml). 
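Both media-tag regexes in this hunk gain an optional (?:amp-)? prefix so that <amp-video> and <amp-audio> are collected alongside the plain HTML5 tags. A minimal sketch of the widened self-closing-tag pattern on invented markup:

    import re

    webpage = '''
    <amp-video src="https://example.com/clip.mp4" width="640"/>
    <audio src="https://example.com/clip.mp3"/>
    '''

    for media_tag, media_type in re.findall(
            r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage):
        print(media_type, media_tag)
    # video <amp-video src="https://example.com/clip.mp4" width="640"/>
    # audio <audio src="https://example.com/clip.mp3"/>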
-            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) +            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))          for media_tag, media_type, media_content in media_tags:              media_info = {                  'formats': [], @@ -2149,9 +2181,15 @@ class InfoExtractor(object):                      src = source_attributes.get('src')                      if not src:                          continue -                    is_plain_url, formats = _media_formats(src, media_type) +                    f = parse_content_type(source_attributes.get('type')) +                    is_plain_url, formats = _media_formats(src, media_type, f)                      if is_plain_url: -                        f = parse_content_type(source_attributes.get('type')) +                        # res attribute is not standard but seen several times +                        # in the wild +                        f.update({ +                            'height': int_or_none(source_attributes.get('res')), +                            'format_id': source_attributes.get('label'), +                        })                          f.update(formats[0])                          media_info['formats'].append(f)                      else: @@ -2174,7 +2212,7 @@ class InfoExtractor(object):      def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):          formats = []          hdcore_sign = 'hdcore=3.7.0' -        f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') +        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')          hds_host = hosts.get('hds')          if hds_host:              f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) @@ -2196,8 +2234,9 @@ class InfoExtractor(object):      def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):          url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) -        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url') -        http_base_url = 'http' + url_base +        url_base = self._search_regex( +            r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') +        http_base_url = '%s:%s' % ('http', url_base)          formats = []          if 'm3u8' not in skip_protocols:              formats.extend(self._extract_m3u8_formats( @@ -2231,7 +2270,7 @@ class InfoExtractor(object):              for protocol in ('rtmp', 'rtsp'):                  if protocol not in skip_protocols:                      formats.append({ -                        'url': protocol + url_base, +                        'url': '%s:%s' % (protocol, url_base),                          'format_id': protocol,                          'protocol': protocol,                      }) @@ -2289,6 +2328,8 @@ class InfoExtractor(object):              tracks = video_data.get('tracks')              if tracks and isinstance(tracks, list):                  for track in tracks: +                    if not isinstance(track, dict): +                        continue                      if track.get('kind') != 'captions':                          continue                      track_url = urljoin(base_url, track.get('file')) @@ -2318,6 +2359,8 @@ class InfoExtractor(object):          urls = []          formats = []          for source in jwplayer_sources_data: +            if not 
isinstance(source, dict): +                continue              source_url = self._proto_relative_url(source.get('file'))              if not source_url:                  continue diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index d3463b874..ed278fefc 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -16,7 +16,6 @@ from ..utils import (      mimetype2ext,      orderedSet,      parse_iso8601, -    remove_end,  ) @@ -50,10 +49,17 @@ class CondeNastIE(InfoExtractor):          'wmagazine': 'W Magazine',      } -    _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys()) +    _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/ +        (?: +            (?: +                embed(?:js)?| +                (?:script|inline)/video +            )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?| +            (?P<type>watch|series|video)/(?P<display_id>[^/?#]+) +        )''' % '|'.join(_SITES.keys())      IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) -    EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys()) +    EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys())      _TESTS = [{          'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', @@ -89,6 +95,12 @@ class CondeNastIE(InfoExtractor):              'upload_date': '20150916',              'timestamp': 1442434955,          } +    }, { +        'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', +        'only_matching': True, +    }, { +        'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js', +        'only_matching': True,      }]      def _extract_series(self, url, webpage): @@ -104,16 +116,16 @@ class CondeNastIE(InfoExtractor):          entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]          return self.playlist_result(entries, playlist_title=title) -    def _extract_video(self, webpage, url_type): -        query = {} -        params = self._search_regex( -            r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) -        if params: -            query.update({ -                'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'), -                'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'), -                'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'), -            }) +    def _extract_video_params(self, webpage, display_id): +        query = self._parse_json( +            self._search_regex( +                r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params', +                default='{}'), +            display_id, transform_source=js_to_json, fatal=False) +        if query: +            query['videoId'] = self._search_regex( +                r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)', +                webpage, 'video id', default=None)          else:              params = extract_attributes(self._search_regex(                  r'(<[^>]+data-js="video-player"[^>]+>)', @@ -123,17 +135,40 @@ class CondeNastIE(InfoExtractor):                  'playerId': 
params['data-player'],                  'target': params['id'],              }) -        video_id = query['videoId'] +        return query + +    def _extract_video(self, params): +        video_id = params['videoId'] +          video_info = None + +        # New API path +        query = params.copy() +        query['embedType'] = 'inline'          info_page = self._download_json( -            'http://player.cnevids.com/player/video.js', -            video_id, 'Downloading video info', fatal=False, query=query) +            'http://player.cnevids.com/embed-api.json', video_id, +            'Downloading embed info', fatal=False, query=query) + +        # Old fallbacks +        if not info_page: +            if params.get('playerId'): +                info_page = self._download_json( +                    'http://player.cnevids.com/player/video.js', video_id, +                    'Downloading video info', fatal=False, query=params)          if info_page:              video_info = info_page.get('video')          if not video_info:              info_page = self._download_webpage(                  'http://player.cnevids.com/player/loader.js', -                video_id, 'Downloading loader info', query=query) +                video_id, 'Downloading loader info', query=params) +        if not video_info: +            info_page = self._download_webpage( +                'https://player.cnevids.com/inline/video/%s.js' % video_id, +                video_id, 'Downloading inline info', query={ +                    'target': params.get('target', 'embedplayer') +                }) + +        if not video_info:              video_info = self._parse_json(                  self._search_regex(                      r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), @@ -161,9 +196,7 @@ class CondeNastIE(InfoExtractor):              })          self._sort_formats(formats) -        info = self._search_json_ld( -            webpage, video_id, fatal=False) if url_type != 'embed' else {} -        info.update({ +        return {              'id': video_id,              'formats': formats,              'title': title, @@ -174,22 +207,26 @@ class CondeNastIE(InfoExtractor):              'series': video_info.get('series_title'),              'season': video_info.get('season_title'),              'timestamp': parse_iso8601(video_info.get('premiere_date')), -        }) -        return info +            'categories': video_info.get('categories'), +        }      def _real_extract(self, url): -        site, url_type, item_id = re.match(self._VALID_URL, url).groups() +        video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups() -        # Convert JS embed to regular embed -        if url_type == 'embedjs': -            parsed_url = compat_urlparse.urlparse(url) -            url = compat_urlparse.urlunparse(parsed_url._replace( -                path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/'))) -            url_type = 'embed' +        if video_id: +            return self._extract_video({ +                'videoId': video_id, +                'playerId': player_id, +                'target': target, +            }) -        webpage = self._download_webpage(url, item_id) +        webpage = self._download_webpage(url, display_id)          if url_type == 'series':              return self._extract_series(url, webpage)          else: -            return self._extract_video(webpage, url_type) +            params = self._extract_video_params(webpage, 
display_id) +            info = self._search_json_ld( +                webpage, display_id, fatal=False) +            info.update(self._extract_video(params)) +            return info diff --git a/youtube_dl/extractor/corus.py b/youtube_dl/extractor/corus.py index 7b2f5008b..807a29eea 100644 --- a/youtube_dl/extractor/corus.py +++ b/youtube_dl/extractor/corus.py @@ -8,7 +8,16 @@ from ..utils import int_or_none  class CorusIE(ThePlatformFeedIE): -    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:globaltv|etcanada)\.com|(?:hgtv|foodnetwork|slice)\.ca)/(?:video/|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=))(?P<id>\d+)' +    _VALID_URL = r'''(?x) +                    https?:// +                        (?:www\.)? +                        (?P<domain> +                            (?:globaltv|etcanada)\.com| +                            (?:hgtv|foodnetwork|slice|history|showcase)\.ca +                        ) +                        /(?:video/|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=)) +                        (?P<id>\d+) +                    '''      _TESTS = [{          'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/',          'md5': '05dcbca777bf1e58c2acbb57168ad3a6', @@ -27,6 +36,12 @@ class CorusIE(ThePlatformFeedIE):      }, {          'url': 'http://etcanada.com/video/873675331955/meet-the-survivor-game-changers-castaways-part-2/',          'only_matching': True, +    }, { +        'url': 'http://www.history.ca/the-world-without-canada/video/full-episodes/natural-resources/video.html?v=955054659646#video', +        'only_matching': True, +    }, { +        'url': 'http://www.showcase.ca/eyewitness/video/eyewitness++106/video.html?v=955070531919&p=1&s=da#video', +        'only_matching': True,      }]      _TP_FEEDS = { @@ -50,6 +65,14 @@ class CorusIE(ThePlatformFeedIE):              'feed_id': '5tUJLgV2YNJ5',              'account_id': 2414427935,          }, +        'history': { +            'feed_id': 'tQFx_TyyEq4J', +            'account_id': 2369613659, +        }, +        'showcase': { +            'feed_id': '9H6qyshBZU3E', +            'account_id': 2414426607, +        },      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py index 94d03ce2a..f77a68ece 100644 --- a/youtube_dl/extractor/cracked.py +++ b/youtube_dl/extractor/cracked.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from .youtube import YoutubeIE  from ..utils import (      parse_iso8601,      str_to_int, @@ -41,11 +42,9 @@ class CrackedIE(InfoExtractor):          webpage = self._download_webpage(url, video_id) -        youtube_url = self._search_regex( -            r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', -            webpage, 'youtube url', default=None) +        youtube_url = YoutubeIE._extract_url(webpage)          if youtube_url: -            return self.url_result(youtube_url, 'Youtube') +            return self.url_result(youtube_url, ie=YoutubeIE.ie_key())          video_url = self._html_search_regex(              [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'], diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index f919ed208..13f425b2b 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -21,9 +21,10 @@ class CrackleIE(InfoExtractor):              'season_number': 8,              'episode_number': 4,           
   'subtitles': { -                'en-US': [{ -                    'ext': 'ttml', -                }] +                'en-US': [ +                    {'ext': 'vtt'}, +                    {'ext': 'tt'}, +                ]              },          },          'params': { diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 2ffa4a7f8..8bdaf0c2c 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -510,7 +510,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text          # webpage provide more accurate data than series_title from XML          series = self._html_search_regex( -            r'id=["\']showmedia_about_episode_num[^>]+>\s*<a[^>]+>([^<]+)', +            r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d',              webpage, 'series', fatal=False)          season = xpath_text(metadata, 'series_title') @@ -518,7 +518,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text          episode_number = int_or_none(xpath_text(metadata, 'episode_number'))          season_number = int_or_none(self._search_regex( -            r'(?s)<h4[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h4>\s*<h4>\s*Season (\d+)', +            r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',              webpage, 'season number', default=None))          return { diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index d4576160b..171820e27 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -10,6 +10,7 @@ from ..utils import (      smuggle_url,      determine_ext,      ExtractorError, +    extract_attributes,  )  from .senateisvp import SenateISVPIE  from .ustream import UstreamIE @@ -68,6 +69,7 @@ class CSpanIE(InfoExtractor):              'uploader_id': '12987475',          },      }] +    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'      def _real_extract(self, url):          video_id = self._match_id(url) @@ -78,6 +80,19 @@ class CSpanIE(InfoExtractor):          if ustream_url:              return self.url_result(ustream_url, UstreamIE.ie_key()) +        if '&vod' not in url: +            bc = self._search_regex( +                r"(<[^>]+id='brightcove-player-embed'[^>]+>)", +                webpage, 'brightcove embed', default=None) +            if bc: +                bc_attr = extract_attributes(bc) +                bc_url = self.BRIGHTCOVE_URL_TEMPLATE % ( +                    bc_attr.get('data-bcaccountid', '3162030207001'), +                    bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'), +                    bc_attr.get('data-newbcplayerid', 'default'), +                    bc_attr['data-bcid']) +                return self.url_result(smuggle_url(bc_url, {'source_url': url})) +          # We first look for clipid, because clipprog always appears before          patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]          results = list(filter(None, (re.search(p, webpage) for p in patterns))) diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index 98c835bf1..af3978035 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -1,17 +1,21 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import (      
int_or_none,      determine_protocol, +    try_get,      unescapeHTML,  )  class DailyMailIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)' +    _TESTS = [{          'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html',          'md5': 'f6129624562251f628296c3a9ffde124',          'info_dict': { @@ -20,7 +24,16 @@ class DailyMailIE(InfoExtractor):              'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'',              'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84',          } -    } +    }, { +        'url': 'http://www.dailymail.co.uk/embed/video/1295863.html', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_urls(webpage): +        return re.findall( +            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)', +            webpage)      def _real_extract(self, url):          video_id = self._match_id(url) @@ -28,8 +41,14 @@ class DailyMailIE(InfoExtractor):          video_data = self._parse_json(self._search_regex(              r"data-opts='({.+?})'", webpage, 'video data'), video_id)          title = unescapeHTML(video_data['title']) -        video_sources = self._download_json(video_data.get( -            'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + +        sources_url = (try_get( +            video_data, +            (lambda x: x['plugins']['sources']['url'], +             lambda x: x['sources']['url']), compat_str) or +            'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id) + +        video_sources = self._download_json(sources_url, video_id)          formats = []          for rendition in video_sources['renditions']: diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 441114d19..e9d0dd19c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -38,7 +38,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):  class DailymotionIE(DailymotionBaseInfoExtractor): -    _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)' +    _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)'      IE_NAME = 'dailymotion'      _FORMATS = [ @@ -49,87 +49,82 @@ class DailymotionIE(DailymotionBaseInfoExtractor):          ('stream_h264_hd1080_url', 'hd180'),      ] -    _TESTS = [ -        { -            'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', -            'md5': '074b95bdee76b9e3654137aee9c79dfe', -            'info_dict': { -                'id': 'x5kesuj', -                'ext': 'mp4', -                'title': 'Office Christmas Party Review –  Jason Bateman, Olivia Munn, T.J. Miller', -                'description': 'Office Christmas Party Review -  Jason Bateman, Olivia Munn, T.J. 
Miller', -                'thumbnail': r're:^https?:.*\.(?:jpg|png)$', -                'duration': 187, -                'timestamp': 1493651285, -                'upload_date': '20170501', -                'uploader': 'Deadline', -                'uploader_id': 'x1xm8ri', -                'age_limit': 0, -                'view_count': int, -            }, +    _TESTS = [{ +        'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', +        'md5': '074b95bdee76b9e3654137aee9c79dfe', +        'info_dict': { +            'id': 'x5kesuj', +            'ext': 'mp4', +            'title': 'Office Christmas Party Review –  Jason Bateman, Olivia Munn, T.J. Miller', +            'description': 'Office Christmas Party Review -  Jason Bateman, Olivia Munn, T.J. Miller', +            'thumbnail': r're:^https?:.*\.(?:jpg|png)$', +            'duration': 187, +            'timestamp': 1493651285, +            'upload_date': '20170501', +            'uploader': 'Deadline', +            'uploader_id': 'x1xm8ri', +            'age_limit': 0, +            'view_count': int,          }, -        { -            'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', -            'md5': '2137c41a8e78554bb09225b8eb322406', -            'info_dict': { -                'id': 'x2iuewm', -                'ext': 'mp4', -                'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', -                'description': 'Several come bundled with the Steam Controller.', -                'thumbnail': r're:^https?:.*\.(?:jpg|png)$', -                'duration': 74, -                'timestamp': 1425657362, -                'upload_date': '20150306', -                'uploader': 'IGN', -                'uploader_id': 'xijv66', -                'age_limit': 0, -                'view_count': int, -            }, -            'skip': 'video gone', +    }, { +        'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', +        'md5': '2137c41a8e78554bb09225b8eb322406', +        'info_dict': { +            'id': 'x2iuewm', +            'ext': 'mp4', +            'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', +            'description': 'Several come bundled with the Steam Controller.', +            'thumbnail': r're:^https?:.*\.(?:jpg|png)$', +            'duration': 74, +            'timestamp': 1425657362, +            'upload_date': '20150306', +            'uploader': 'IGN', +            'uploader_id': 'xijv66', +            'age_limit': 0, +            'view_count': int,          }, +        'skip': 'video gone', +    }, {          # Vevo video -        { -            'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', -            'info_dict': { -                'title': 'Roar (Official)', -                'id': 'USUV71301934', -                'ext': 'mp4', -                'uploader': 'Katy Perry', -                'upload_date': '20130905', -            }, -            'params': { -                'skip_download': True, -            }, -            'skip': 'VEVO is only available in some countries', +        'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', +        'info_dict': { +            'title': 'Roar (Official)', +            'id': 'USUV71301934', +            'ext': 'mp4', +            
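
The DailyMail hunk above resolves sources_url with try_get over two candidate paths into the parsed player data before falling back to the fixed video-sources.json endpoint. A rough, standalone re-implementation of that lookup pattern, assuming only what the call site shows (the real try_get lives in youtube_dl/utils.py; the helper name and sample dict are made up):

def try_get_first(src, getters, expected_type=None):
    # Try each getter in turn and return the first value of the expected type
    for getter in getters:
        try:
            value = getter(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None

video_data = {'plugins': {'sources': {'url': 'http://example.com/sources.json'}}}
sources_url = try_get_first(
    video_data,
    (lambda x: x['plugins']['sources']['url'],
     lambda x: x['sources']['url']),
    str) or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % '1295863'
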
'uploader': 'Katy Perry', +            'upload_date': '20130905',          }, +        'params': { +            'skip_download': True, +        }, +        'skip': 'VEVO is only available in some countries', +    }, {          # age-restricted video -        { -            'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', -            'md5': '0d667a7b9cebecc3c89ee93099c4159d', -            'info_dict': { -                'id': 'xyh2zz', -                'ext': 'mp4', -                'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', -                'uploader': 'HotWaves1012', -                'age_limit': 18, -            }, -            'skip': 'video gone', +        'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', +        'md5': '0d667a7b9cebecc3c89ee93099c4159d', +        'info_dict': { +            'id': 'xyh2zz', +            'ext': 'mp4', +            'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', +            'uploader': 'HotWaves1012', +            'age_limit': 18,          }, +        'skip': 'video gone', +    }, {          # geo-restricted, player v5 -        { -            'url': 'http://www.dailymotion.com/video/xhza0o', -            'only_matching': True, -        }, +        'url': 'http://www.dailymotion.com/video/xhza0o', +        'only_matching': True, +    }, {          # with subtitles -        { -            'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', -            'only_matching': True, -        }, -        { -            'url': 'http://www.dailymotion.com/swf/video/x3n92nf', -            'only_matching': True, -        } -    ] +        'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', +        'only_matching': True, +    }, { +        'url': 'http://www.dailymotion.com/swf/video/x3n92nf', +        'only_matching': True, +    }, { +        'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun', +        'only_matching': True, +    }]      @staticmethod      def _extract_urls(webpage): @@ -152,7 +147,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):          view_count_str = self._search_regex(              (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',               r'video_views_count[^>]+>\s+([\s\d\,.]+)'), -            webpage, 'view count', fatal=False) +            webpage, 'view count', default=None)          if view_count_str:              view_count_str = re.sub(r'\s', '', view_count_str)          view_count = str_to_int(view_count_str) @@ -164,7 +159,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):              [r'buildPlayer\(({.+?})\);\n',  # See https://github.com/rg3/youtube-dl/issues/7826               r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',               r'buildPlayer\(({.+?})\);', -             r'var\s+config\s*=\s*({.+?});'], +             r'var\s+config\s*=\s*({.+?});', +             # New layout regex (see https://github.com/rg3/youtube-dl/issues/13580) +             r'__PLAYER_CONFIG__\s*=\s*({.+?});'],              webpage, 'player v5', default=None)          if player_v5:              player = self._parse_json(player_v5, video_id) @@ -328,7 +325,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):  
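
In the DailymotionPlaylistIE hunk that follows, _VALID_URL tightens the id group from a lazy .+? (which required a trailing slash) to [^/?#&]+, so the id now stops at the first path, query or fragment delimiter. A small self-checking illustration of the difference, using a made-up playlist URL:

import re

OLD = r'/playlist/(?P<id>.+?)/'
NEW = r'/playlist/(?P<id>[^/?#&]+)'

url = 'https://www.dailymotion.com/playlist/x4hb3b'  # hypothetical, no trailing slash
assert re.search(OLD, url) is None                   # old pattern needs the slash
assert re.search(NEW, url).group('id') == 'x4hb3b'   # new pattern matches the bare id
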
class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):      IE_NAME = 'dailymotion:playlist' -    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/' +    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)'      _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'      _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'      _TESTS = [{ diff --git a/youtube_dl/extractor/disney.py b/youtube_dl/extractor/disney.py index 939d1338c..968c4c7fd 100644 --- a/youtube_dl/extractor/disney.py +++ b/youtube_dl/extractor/disney.py @@ -15,7 +15,7 @@ from ..utils import (  class DisneyIE(InfoExtractor):      _VALID_URL = r'''(?x) -        https?://(?P<domain>(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P<id>[a-z0-9]{24})|(?:[^/]+/)?(?P<display_id>[^/?#]+))''' +        https?://(?P<domain>(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr|channel\.de)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P<id>[a-z0-9]{24})|(?:[^/]+/)?(?P<display_id>[^/?#]+))'''      _TESTS = [{          # Disney.EmbedVideo          'url': 'http://video.disney.com/watch/moana-trailer-545ed1857afee5a0ec239977', @@ -69,6 +69,9 @@ class DisneyIE(InfoExtractor):          'url': 'http://disneyjunior.en.disneyme.com/dj/watch-my-friends-tigger-and-pooh-promo',          'only_matching': True,      }, { +        'url': 'http://disneychannel.de/sehen/soy-luna-folge-118-5518518987ba27f3cc729268', +        'only_matching': True, +    }, {          'url': 'http://disneyjunior.disney.com/galactech-the-galactech-grab-galactech-an-admiral-rescue',          'only_matching': True,      }] diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index a78cb8a2a..c05f601e2 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -13,7 +13,7 @@ from ..utils import (  class DigitallySpeakingIE(InfoExtractor): -    _VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml' +    _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'      _TESTS = [{          # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface @@ -28,6 +28,10 @@ class DigitallySpeakingIE(InfoExtractor):          # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC          'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml',          'only_matching': True, +    }, { +        # From http://www.gdcvault.com/play/1013700/Advanced-Material +        'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', +        'only_matching': True,      }]      def _parse_mp4(self, metadata): diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index d22133d24..9757f4422 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -3,11 +3,14 @@ from __future__ import unicode_literals  import time  import hashlib +import re  from .common import InfoExtractor  from ..utils import (      ExtractorError,      unescapeHTML, +    unified_strdate, +    urljoin,  ) @@ -117,3 +120,82 @@ class DouyuTVIE(InfoExtractor):              'uploader': uploader,              'is_live': True,          } + + +class 
DouyuShowIE(InfoExtractor):
+    _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
+
+    _TESTS = [{
+        'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
+        'md5': '0c2cfd068ee2afe657801269b2d86214',
+        'info_dict': {
+            'id': 'rjNBdvnVXNzvE2yw',
+            'ext': 'mp4',
+            'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场',
+            'duration': 7150.08,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': '陈一发儿',
+            'uploader_id': 'XrZwYelr5wbK',
+            'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
+            'upload_date': '20170402',
+        },
+    }, {
+        'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        # Normalize mobile URLs to the desktop variant used for extraction
+        url = url.replace('vmobile.', 'v.')
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        room_info = self._parse_json(self._search_regex(
+            r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
+
+        video_info = None
+
+        for trial in range(5):
+            # Douyu sometimes rejects the request, so retry a few times
+            try:
+                video_info = self._download_json(
+                    'https://vmobile.douyu.com/video/getInfo', video_id,
+                    query={'vid': video_id},
+                    headers={
+                        'Referer': url,
+                        'x-requested-with': 'XMLHttpRequest',
+                    })
+                break
+            except ExtractorError:
+                self._sleep(1, video_id)
+
+        if not video_info:
+            raise ExtractorError('Can\'t fetch video info')
+
+        formats = self._extract_m3u8_formats(
+            video_info['data']['video_url'], video_id,
+            entry_protocol='m3u8_native', ext='mp4')
+
+        upload_date = unified_strdate(self._html_search_regex(
+            r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
+            'upload date', fatal=False))
+
+        uploader = uploader_id = uploader_url = None
+        mobj = re.search(
+            r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
+            webpage)
+        if mobj:
+            uploader_id, uploader = mobj.groups()
+            uploader_url = urljoin(url, '/author/' + uploader_id)
+
+        return {
+            'id': video_id,
+            'title': room_info['name'],
+            'formats': formats,
+            'duration': room_info.get('duration'),
+            'thumbnail': room_info.get('pic'),
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'uploader_url': uploader_url,
+        }
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
index 87c5dd63e..76e784105 100644
--- a/youtube_dl/extractor/dplay.py
+++ b/youtube_dl/extractor/dplay.py
@@ -7,16 +7,18 @@ import time

 from .common import InfoExtractor
 from ..compat import (
-    compat_urlparse,
     compat_HTTPError,
+    compat_str,
+    compat_urlparse,
 )
 from ..utils import (
-    USER_AGENTS,
     ExtractorError,
     int_or_none,
-    unified_strdate,
     remove_end,
+    try_get,
+    unified_strdate,
     update_url_query,
+    USER_AGENTS,
 )


@@ -183,28 +185,44 @@ class DPlayItIE(InfoExtractor):

         webpage = self._download_webpage(url, display_id)

-        info_url = self._search_regex(
  r'url\s*:\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', -            webpage, 'video id') -          title = remove_end(self._og_search_title(webpage), ' | Dplay') -        try: -            info = self._download_json( -                info_url, display_id, headers={ -                    'Authorization': 'Bearer %s' % self._get_cookies(url).get( -                        'dplayit_token').value, -                    'Referer': url, -                }) -        except ExtractorError as e: -            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): -                info = self._parse_json(e.cause.read().decode('utf-8'), display_id) -                error = info['errors'][0] -                if error.get('code') == 'access.denied.geoblocked': -                    self.raise_geo_restricted( -                        msg=error.get('detail'), countries=self._GEO_COUNTRIES) -                raise ExtractorError(info['errors'][0]['detail'], expected=True) -            raise +        video_id = None + +        info = self._search_regex( +            r'playback_json\s*:\s*JSON\.parse\s*\(\s*("(?:\\.|[^"\\])+?")', +            webpage, 'playback JSON', default=None) +        if info: +            for _ in range(2): +                info = self._parse_json(info, display_id, fatal=False) +                if not info: +                    break +            else: +                video_id = try_get(info, lambda x: x['data']['id']) + +        if not info: +            info_url = self._search_regex( +                r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', +                webpage, 'info url') + +            video_id = info_url.rpartition('/')[-1] + +            try: +                info = self._download_json( +                    info_url, display_id, headers={ +                        'Authorization': 'Bearer %s' % self._get_cookies(url).get( +                            'dplayit_token').value, +                        'Referer': url, +                    }) +            except ExtractorError as e: +                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): +                    info = self._parse_json(e.cause.read().decode('utf-8'), display_id) +                    error = info['errors'][0] +                    if error.get('code') == 'access.denied.geoblocked': +                        self.raise_geo_restricted( +                            msg=error.get('detail'), countries=self._GEO_COUNTRIES) +                    raise ExtractorError(info['errors'][0]['detail'], expected=True) +                raise          hls_url = info['data']['attributes']['streaming']['hls']['url'] @@ -230,7 +248,7 @@ class DPlayItIE(InfoExtractor):              season_number = episode_number = upload_date = None          return { -            'id': info_url.rpartition('/')[-1], +            'id': compat_str(video_id or display_id),              'display_id': display_id,              'title': title,              'description': self._og_search_description(webpage), diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index e7abc8889..9a498d72a 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -12,6 +12,7 @@ from ..utils import (      ExtractorError,      clean_html,      int_or_none, +    remove_end,      sanitized_Request,      urlencode_postdata  ) @@ -72,15 +73,15 @@ class DramaFeverIE(DramaFeverBaseIE):          'url': 
        'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
         'info_dict': {
             'id': '4512.1',
-            'ext': 'mp4',
-            'title': 'Cooking with Shin 4512.1',
+            'ext': 'flv',
+            'title': 'Cooking with Shin',
             'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
             'episode': 'Episode 1',
             'episode_number': 1,
             'thumbnail': r're:^https?://.*\.jpg',
             'timestamp': 1404336058,
             'upload_date': '20140702',
-            'duration': 343,
+            'duration': 344,
         },
         'params': {
             # m3u8 download
@@ -90,15 +91,15 @@
         'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1',
         'info_dict': {
             'id': '4826.4',
-            'ext': 'mp4',
-            'title': 'Mnet Asian Music Awards 2015 4826.4',
+            'ext': 'flv',
+            'title': 'Mnet Asian Music Awards 2015',
             'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91',
             'episode': 'Mnet Asian Music Awards 2015 - Part 3',
             'episode_number': 4,
             'thumbnail': r're:^https?://.*\.jpg',
             'timestamp': 1450213200,
             'upload_date': '20151215',
-            'duration': 5602,
+            'duration': 5359,
         },
         'params': {
             # m3u8 download
@@ -122,6 +123,10 @@
                     countries=self._GEO_COUNTRIES)
             raise

+        # The title is suffixed with the video id for some reason; strip it
+        if info.get('title'):
+            info['title'] = remove_end(info['title'], video_id).strip()
+
         series_id, episode_number = video_id.split('.')
         episode_info = self._download_json(
             # We only need a single episode info, so restricting page size to one episode
diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py
index 79ec212c8..164e97c36 100644
--- a/youtube_dl/extractor/drbonanza.py
+++ b/youtube_dl/extractor/drbonanza.py
@@ -1,135 +1,59 @@
 from __future__ import unicode_literals

-import json
 import re

 from .common import InfoExtractor
 from ..utils import (
-    int_or_none,
-    parse_iso8601,
+    js_to_json,
+    parse_duration,
+    unescapeHTML,
 )


 class DRBonanzaIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/(?:[^/]+/)+(?:[^/])+?(?:assetId=(?P<id>\d+))?(?:[#&]|$)'
-
-    _TESTS = [{
-        'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
+    _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/[^/]+/\d+/[^/]+/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://www.dr.dk/bonanza/serie/154/matador/40312/matador---0824-komme-fremmede-',
         'info_dict': {
-            'id': '65517',
+            'id': '40312',
+            'display_id': 'matador---0824-komme-fremmede-',
             'ext': 'mp4',
-            'title': 'Talkshowet - Leonard Cohen',
-            'description': 'md5:8f34194fb30cd8c8a30ad8b27b70c0ca',
-            'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
-            'timestamp': 1295537932,
-            'upload_date': '20110120',
-            'duration': 3664,
-        },
-        'params': {
-            'skip_download': True,  # requires rtmp
-        },
-    }, {
-        'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
-        'md5': 
'6dfe039417e76795fb783c52da3de11d', -        'info_dict': { -            'id': '59410', -            'ext': 'mp3', -            'title': 'EM fodbold 1992 Danmark - Tyskland finale Transmission', -            'description': 'md5:501e5a195749480552e214fbbed16c4e', +            'title': 'MATADOR - 08:24. "Komme fremmede".', +            'description': 'md5:77b4c1ac4d4c1b9d610ab4395212ff84',              'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', -            'timestamp': 1223274900, -            'upload_date': '20081006', -            'duration': 7369, +            'duration': 4613,          }, -    }] +    }      def _real_extract(self, url): -        url_id = self._match_id(url) -        webpage = self._download_webpage(url, url_id) - -        if url_id: -            info = json.loads(self._html_search_regex(r'({.*?%s.*})' % url_id, webpage, 'json')) -        else: -            # Just fetch the first video on that page -            info = json.loads(self._html_search_regex(r'bonanzaFunctions.newPlaylist\(({.*})\)', webpage, 'json')) - -        asset_id = str(info['AssetId']) -        title = info['Title'].rstrip(' \'\"-,.:;!?') -        duration = int_or_none(info.get('Duration'), scale=1000) -        # First published online. "FirstPublished" contains the date for original airing. -        timestamp = parse_iso8601( -            re.sub(r'\.\d+$', '', info['Created'])) - -        def parse_filename_info(url): -            match = re.search(r'/\d+_(?P<width>\d+)x(?P<height>\d+)x(?P<bitrate>\d+)K\.(?P<ext>\w+)$', url) -            if match: -                return { -                    'width': int(match.group('width')), -                    'height': int(match.group('height')), -                    'vbr': int(match.group('bitrate')), -                    'ext': match.group('ext') -                } -            match = re.search(r'/\d+_(?P<bitrate>\d+)K\.(?P<ext>\w+)$', url) -            if match: -                return { -                    'vbr': int(match.group('bitrate')), -                    'ext': match.group(2) -                } -            return {} +        mobj = re.match(self._VALID_URL, url) +        video_id, display_id = mobj.group('id', 'display_id') -        video_types = ['VideoHigh', 'VideoMid', 'VideoLow'] -        preferencemap = { -            'VideoHigh': -1, -            'VideoMid': -2, -            'VideoLow': -3, -            'Audio': -4, -        } +        webpage = self._download_webpage(url, display_id) -        formats = [] -        for file in info['Files']: -            if info['Type'] == 'Video': -                if file['Type'] in video_types: -                    format = parse_filename_info(file['Location']) -                    format.update({ -                        'url': file['Location'], -                        'format_id': file['Type'].replace('Video', ''), -                        'preference': preferencemap.get(file['Type'], -10), -                    }) -                    if format['url'].startswith('rtmp'): -                        rtmp_url = format['url'] -                        format['rtmp_live'] = True  # --resume does not work -                        if '/bonanza/' in rtmp_url: -                            format['play_path'] = rtmp_url.split('/bonanza/')[1] -                    formats.append(format) -                elif file['Type'] == 'Thumb': -                    thumbnail = file['Location'] -            elif info['Type'] == 'Audio': -                if file['Type'] == 'Audio': -                    format = 
parse_filename_info(file['Location']) -                    format.update({ -                        'url': file['Location'], -                        'format_id': file['Type'], -                        'vcodec': 'none', -                    }) -                    formats.append(format) -                elif file['Type'] == 'Thumb': -                    thumbnail = file['Location'] +        info = self._parse_html5_media_entries( +            url, webpage, display_id, m3u8_id='hls', +            m3u8_entry_protocol='m3u8_native')[0] +        self._sort_formats(info['formats']) -        description = '%s\n%s\n%s\n' % ( -            info['Description'], info['Actors'], info['Colophon']) +        asset = self._parse_json( +            self._search_regex( +                r'(?s)currentAsset\s*=\s*({.+?})\s*</script', webpage, 'asset'), +            display_id, transform_source=js_to_json) -        self._sort_formats(formats) +        title = unescapeHTML(asset['AssetTitle']).strip() -        display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id -        display_id = re.sub(r'-+', '-', display_id) +        def extract(field): +            return self._search_regex( +                r'<div[^>]+>\s*<p>%s:<p>\s*</div>\s*<div[^>]+>\s*<p>([^<]+)</p>' % field, +                webpage, field, default=None) -        return { -            'id': asset_id, +        info.update({ +            'id': asset.get('AssetId') or video_id,              'display_id': display_id,              'title': title, -            'formats': formats, -            'description': description, -            'thumbnail': thumbnail, -            'timestamp': timestamp, -            'duration': duration, -        } +            'description': extract('Programinfo'), +            'duration': parse_duration(extract('Tid')), +            'thumbnail': asset.get('AssetImageUrl'), +        }) +        return info diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 1eca82b3b..c5d56a9ad 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -44,8 +44,23 @@ class DrTuberIE(InfoExtractor):          webpage = self._download_webpage(              'http://www.drtuber.com/video/%s' % video_id, display_id) -        video_url = self._html_search_regex( -            r'<source src="([^"]+)"', webpage, 'video URL') +        video_data = self._download_json( +            'http://www.drtuber.com/player_config_json/', video_id, query={ +                'vid': video_id, +                'embed': 0, +                'aid': 0, +                'domain_id': 0, +            }) + +        formats = [] +        for format_id, video_url in video_data['files'].items(): +            if video_url: +                formats.append({ +                    'format_id': format_id, +                    'quality': 2 if format_id == 'hq' else 1, +                    'url': video_url +                }) +        self._sort_formats(formats)          title = self._html_search_regex(              (r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', @@ -75,7 +90,7 @@ class DrTuberIE(InfoExtractor):          return {              'id': video_id,              'display_id': display_id, -            'url': video_url, +            'formats': formats,              'title': title,              'thumbnail': thumbnail,              'like_count': like_count, diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index e4917014a..69effba58 100644 --- 
a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -20,7 +20,7 @@ class DRTVIE(InfoExtractor):      IE_NAME = 'drtv'      _TESTS = [{          'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', -        'md5': '25e659cccc9a2ed956110a299fdf5983', +        'md5': '7ae17b4e18eb5d29212f424a7511c184',          'info_dict': {              'id': 'klassen-darlig-taber-10',              'ext': 'mp4', @@ -30,21 +30,37 @@ class DRTVIE(InfoExtractor):              'upload_date': '20160823',              'duration': 606.84,          }, -        'params': { -            'skip_download': True, -        },      }, { +        # embed          'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', -        'md5': '2c37175c718155930f939ef59952474a',          'info_dict': {              'id': 'christiania-pusher-street-ryddes-drdkrjpo',              'ext': 'mp4',              'title': 'LIVE Christianias rydning af Pusher Street er i gang', -            'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.', +            'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',              'timestamp': 1472800279,              'upload_date': '20160902',              'duration': 131.4,          }, +        'params': { +            'skip_download': True, +        }, +    }, { +        # with SignLanguage formats +        'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', +        'info_dict': { +            'id': 'historien-om-danmark-stenalder', +            'ext': 'mp4', +            'title': 'Historien om Danmark: Stenalder (1)', +            'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', +            'timestamp': 1490401996, +            'upload_date': '20170325', +            'duration': 3502.04, +            'formats': 'mincount:20', +        }, +        'params': { +            'skip_download': True, +        },      }]      def _real_extract(self, url): @@ -88,7 +104,7 @@ class DRTVIE(InfoExtractor):              elif kind in ('VideoResource', 'AudioResource'):                  duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)                  restricted_to_denmark = asset.get('RestrictedToDenmark') -                spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' +                asset_target = asset.get('Target')                  for link in asset.get('Links', []):                      uri = link.get('Uri')                      if not uri: @@ -96,13 +112,13 @@ class DRTVIE(InfoExtractor):                      target = link.get('Target')                      format_id = target or ''                      preference = None -                    if spoken_subtitles: +                    if asset_target in ('SpokenSubtitles', 'SignLanguage'):                          preference = -1 -                        format_id += '-spoken-subtitles' +                        format_id += '-%s' % asset_target                      if target == 'HDS':                          f4m_formats = self._extract_f4m_formats(                              uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', -                            video_id, preference, f4m_id=format_id) +                            video_id, preference, f4m_id=format_id, fatal=False)                          if kind == 'AudioResource':                              for f in f4m_formats:                                  f['vcodec'] = 'none' @@ -110,7 +126,8 @@ 
class DRTVIE(InfoExtractor):                      elif target == 'HLS':                          formats.extend(self._extract_m3u8_formats(                              uri, video_id, 'mp4', entry_protocol='m3u8_native', -                            preference=preference, m3u8_id=format_id)) +                            preference=preference, m3u8_id=format_id, +                            fatal=False))                      else:                          bitrate = link.get('Bitrate')                          if bitrate: diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index 974c69dbc..e85c58bd5 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -5,9 +5,12 @@ import re  from .common import InfoExtractor  from ..utils import ( +    determine_ext, +    ExtractorError, +    int_or_none,      js_to_json, +    mimetype2ext,      unescapeHTML, -    ExtractorError,  ) @@ -24,14 +27,7 @@ class DVTVIE(InfoExtractor):              'id': 'dc0768de855511e49e4b0025900fea04',              'ext': 'mp4',              'title': 'Vondra o Českém století: Při pohledu na Havla mi bylo trapně', -        } -    }, { -        'url': 'http://video.aktualne.cz/dvtv/stropnicky-policie-vrbetice-preventivne-nekontrolovala/r~82ed4322849211e4a10c0025900fea04/', -        'md5': '6388f1941b48537dbd28791f712af8bf', -        'info_dict': { -            'id': '72c02230849211e49f60002590604f2e', -            'ext': 'mp4', -            'title': 'Stropnický: Policie Vrbětice preventivně nekontrolovala', +            'duration': 1484,          }      }, {          'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/', @@ -44,55 +40,100 @@ class DVTVIE(InfoExtractor):              'info_dict': {                  'id': 'b0b40906854d11e4bdad0025900fea04',                  'ext': 'mp4', -                'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne' +                'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne', +                'description': 'md5:0916925dea8e30fe84222582280b47a0', +                'timestamp': 1418760010, +                'upload_date': '20141216',              }          }, {              'md5': '5f7652a08b05009c1292317b449ffea2',              'info_dict': {                  'id': '420ad9ec854a11e4bdad0025900fea04',                  'ext': 'mp4', -                'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka' +                'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka', +                'description': 'md5:ff2f9f6de73c73d7cef4f756c1c1af42', +                'timestamp': 1418760010, +                'upload_date': '20141216',              }          }, {              'md5': '498eb9dfa97169f409126c617e2a3d64',              'info_dict': {                  'id': '95d35580846a11e4b6d20025900fea04',                  'ext': 'mp4', -                'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?' 
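
The DVTV _parse_video_metadata rewrite below dispatches each source on its declared MIME type, falling back to the URL's file extension via determine_ext(video_url, mimetype2ext(video_type)), so HLS and DASH manifests are expanded into individual formats while everything else becomes a plain HTTP format. A standalone sketch of that classification step; the helper and mapping names here are illustrative, not youtube-dl's:

import posixpath

MIME_TO_KIND = {
    'application/vnd.apple.mpegurl': 'hls',
    'application/dash+xml': 'dash',
}

def classify_source(video_url, video_type=None):
    # Prefer the declared MIME type; otherwise fall back to the extension
    # of the URL path (query string stripped)
    kind = MIME_TO_KIND.get(video_type)
    if kind:
        return kind
    ext = posixpath.splitext(video_url.split('?')[0])[1].lstrip('.')
    return {'m3u8': 'hls', 'mpd': 'dash'}.get(ext, 'http')
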
+                'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?', +                'description': 'md5:889fe610a70fee5511dc3326a089188e', +                'timestamp': 1418760010, +                'upload_date': '20141216',              }          }, {              'md5': 'b8dc6b744844032dab6ba3781a7274b9',              'info_dict': {                  'id': '6fe14d66853511e4833a0025900fea04',                  'ext': 'mp4', -                'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády' +                'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády', +                'description': 'md5:544f86de6d20c4815bea11bf2ac3004f', +                'timestamp': 1418760010, +                'upload_date': '20141216',              }          }],      }, { +        'url': 'https://video.aktualne.cz/dvtv/zeman-si-jen-leci-mindraky-sobotku-nenavidi-a-babis-se-mu-te/r~960cdb3a365a11e7a83b0025900fea04/', +        'md5': 'f8efe9656017da948369aa099788c8ea', +        'info_dict': { +            'id': '3c496fec365911e7a6500025900fea04', +            'ext': 'mp4', +            'title': 'Zeman si jen léčí mindráky, Sobotku nenávidí a Babiš se mu teď hodí, tvrdí Kmenta', +            'duration': 1103, +        }, +        'params': { +            'skip_download': True, +        }, +    }, {          'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/',          'only_matching': True,      }]      def _parse_video_metadata(self, js, video_id): -        metadata = self._parse_json(js, video_id, transform_source=js_to_json) +        data = self._parse_json(js, video_id, transform_source=js_to_json) -        formats = [] -        for video in metadata['sources']: -            ext = video['type'][6:] -            formats.append({ -                'url': video['file'], -                'ext': ext, -                'format_id': '%s-%s' % (ext, video['label']), -                'height': int(video['label'].rstrip('p')), -                'fps': 25, -            }) +        title = unescapeHTML(data['title']) +        formats = [] +        for video in data['sources']: +            video_url = video.get('file') +            if not video_url: +                continue +            video_type = video.get('type') +            ext = determine_ext(video_url, mimetype2ext(video_type)) +            if video_type == 'application/vnd.apple.mpegurl' or ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    video_url, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id='hls', fatal=False)) +            elif video_type == 'application/dash+xml' or ext == 'mpd': +                formats.extend(self._extract_mpd_formats( +                    video_url, video_id, mpd_id='dash', fatal=False)) +            else: +                label = video.get('label') +                height = self._search_regex( +                    r'^(\d+)[pP]', label or '', 'height', default=None) +                format_id = ['http'] +                for f in (ext, label): +                    if f: +                        format_id.append(f) +                formats.append({ +                    'url': video_url, +                    'format_id': '-'.join(format_id), +                    'height': int_or_none(height), +                })          self._sort_formats(formats)          return { -            'id': metadata['mediaid'], -            'title': 
unescapeHTML(metadata['title']), -            'thumbnail': self._proto_relative_url(metadata['image'], 'http:'), +            'id': data.get('mediaid') or video_id, +            'title': title, +            'description': data.get('description'), +            'thumbnail': data.get('image'), +            'duration': int_or_none(data.get('duration')), +            'timestamp': int_or_none(data.get('pubtime')),              'formats': formats          } @@ -103,7 +144,7 @@ class DVTVIE(InfoExtractor):          # single video          item = self._search_regex( -            r"(?s)embedData[0-9a-f]{32}\['asset'\]\s*=\s*(\{.+?\});", +            r'(?s)embedData[0-9a-f]{32}\[["\']asset["\']\]\s*=\s*(\{.+?\});',              webpage, 'video', default=None, fatal=False)          if item: @@ -113,6 +154,8 @@ class DVTVIE(InfoExtractor):          items = re.findall(              r"(?s)BBX\.context\.assets\['[0-9a-f]{32}'\]\.push\(({.+?})\);",              webpage) +        if not items: +            items = re.findall(r'(?s)var\s+asset\s*=\s*({.+?});\n', webpage)          if items:              return { diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 76d39adac..42789278e 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -11,6 +11,7 @@ from ..compat import (  from ..utils import (      ExtractorError,      int_or_none, +    unsmuggle_url,  ) @@ -50,6 +51,10 @@ class EaglePlatformIE(InfoExtractor):              'view_count': int,          },          'skip': 'Georestricted', +    }, { +        # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/) +        'url': 'eagleplatform:tvrainru.media.eagleplatform.com:582306', +        'only_matching': True,      }]      @staticmethod @@ -60,16 +65,40 @@ class EaglePlatformIE(InfoExtractor):              webpage)          if mobj is not None:              return mobj.group('url') -        # Basic usage embedding (see http://dultonmedia.github.io/eplayer/) +        PLAYER_JS_RE = r''' +                        <script[^>]+ +                            src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) +                        .+? +                    ''' +        # "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/)          mobj = re.search(              r'''(?xs) -                    <script[^>]+ -                        src=(?P<q1>["\'])(?:https?:)?//(?P<host>.+?\.media\.eagleplatform\.com)/player/player\.js(?P=q1) -                    .+? +                    %s                      <div[^>]+ -                        class=(?P<q2>["\'])eagleplayer(?P=q2)[^>]+ +                        class=(?P<qclass>["\'])eagleplayer(?P=qclass)[^>]+                          data-id=["\'](?P<id>\d+) -            ''', webpage) +            ''' % PLAYER_JS_RE, webpage) +        if mobj is not None: +            return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() +        # Generalization of "Javascript code usage", "Combined usage" and +        # "Usage without attaching to DOM" embeddings (see +        # http://dultonmedia.github.io/eplayer/) +        mobj = re.search( +            r'''(?xs) +                    %s +                    <script> +                    .+? +                    new\s+EaglePlayer\( +                        (?:[^,]+\s*,\s*)? +                        { +                            .+? 
+                            \bid\s*:\s*["\']?(?P<id>\d+) +                            .+? +                        } +                    \s*\) +                    .+? +                    </script> +            ''' % PLAYER_JS_RE, webpage)          if mobj is not None:              return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() @@ -79,9 +108,10 @@ class EaglePlatformIE(InfoExtractor):          if status != 200:              raise ExtractorError(' '.join(response['errors']), expected=True) -    def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', *args, **kwargs): +    def _download_json(self, url_or_request, video_id, *args, **kwargs):          try: -            response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) +            response = super(EaglePlatformIE, self)._download_json( +                url_or_request, video_id, *args, **kwargs)          except ExtractorError as ee:              if isinstance(ee.cause, compat_HTTPError):                  response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) @@ -93,11 +123,24 @@ class EaglePlatformIE(InfoExtractor):          return self._download_json(url_or_request, video_id, note)['data'][0]      def _real_extract(self, url): +        url, smuggled_data = unsmuggle_url(url, {}) +          mobj = re.match(self._VALID_URL, url)          host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id') +        headers = {} +        query = { +            'id': video_id, +        } + +        referrer = smuggled_data.get('referrer') +        if referrer: +            headers['Referer'] = referrer +            query['referrer'] = referrer +          player_data = self._download_json( -            'http://%s/api/player_data?id=%s' % (host, video_id), video_id) +            'http://%s/api/player_data' % host, video_id, +            headers=headers, query=query)          media = player_data['data']['playlist']['viewports'][0]['medialist'][0] diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index db921465e..e4a3046af 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -1,15 +1,18 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    try_get, +    unified_timestamp, +)  class EggheadCourseIE(InfoExtractor):      IE_DESC = 'egghead.io course'      IE_NAME = 'egghead:course' -    _VALID_URL = r'https://egghead\.io/courses/(?P<id>[a-zA-Z_0-9-]+)' +    _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'      _TEST = {          'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',          'playlist_count': 29, @@ -22,18 +25,60 @@ class EggheadCourseIE(InfoExtractor):      def _real_extract(self, url):          playlist_id = self._match_id(url) -        webpage = self._download_webpage(url, playlist_id) -        title = self._html_search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title') -        ul = self._search_regex(r'(?s)<ul class="series-lessons-list">(.*?)</ul>', webpage, 'session list') +        course = self._download_json( +            'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id) + +        entries = [ +            self.url_result( +                'wistia:%s' % lesson['wistia_id'], ie='Wistia', +                video_id=lesson['wistia_id'], video_title=lesson.get('title')) +            for 
lesson in course['lessons'] if lesson.get('wistia_id')] + +        return self.playlist_result( +            entries, playlist_id, course.get('title'), +            course.get('description')) + + +class EggheadLessonIE(InfoExtractor): +    IE_DESC = 'egghead.io lesson' +    IE_NAME = 'egghead:lesson' +    _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)' +    _TEST = { +        'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', +        'info_dict': { +            'id': 'fv5yotjxcg', +            'ext': 'mp4', +            'title': 'Create linear data flow with container style types (Box)', +            'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e', +            'thumbnail': r're:^https?:.*\.jpg$', +            'timestamp': 1481296768, +            'upload_date': '20161209', +            'duration': 304, +            'view_count': 0, +            'tags': ['javascript', 'free'], +        }, +        'params': { +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        lesson_id = self._match_id(url) -        found = re.findall(r'(?s)<a class="[^"]*"\s*href="([^"]+)">\s*<li class="item', ul) -        entries = [self.url_result(m) for m in found] +        lesson = self._download_json( +            'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id)          return { -            '_type': 'playlist', -            'id': playlist_id, -            'title': title, -            'description': self._og_search_description(webpage), -            'entries': entries, +            '_type': 'url_transparent', +            'ie_key': 'Wistia', +            'url': 'wistia:%s' % lesson['wistia_id'], +            'id': lesson['wistia_id'], +            'title': lesson.get('title'), +            'description': lesson.get('summary'), +            'thumbnail': lesson.get('thumb_nail'), +            'timestamp': unified_timestamp(lesson.get('published_at')), +            'duration': int_or_none(lesson.get('duration')), +            'view_count': int_or_none(lesson.get('plays_count')), +            'tags': try_get(lesson, lambda x: x['tag_list'], list),          } diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 8795e0ddf..7a7436068 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -10,7 +10,25 @@ from ..utils import (  class ESPNIE(InfoExtractor): -    _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/video/clip(?:\?.*?\bid=|/_/id/)(?P<id>\d+)' +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            (?:(?:\w+\.)+)?espn\.go| +                            (?:www\.)?espn +                        )\.com/ +                        (?: +                            (?: +                                video/clip| +                                watch/player +                            ) +                            (?: +                                \?.*?\bid=| +                                /_/id/ +                            ) +                        ) +                        (?P<id>\d+) +                    ''' +      _TESTS = [{          'url': 'http://espn.go.com/video/clip?id=10365079',          'info_dict': { @@ -25,21 +43,35 @@ class ESPNIE(InfoExtractor):              'skip_download': True,          },      }, { -        # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season -        'url': 
'http://espn.go.com/video/clip?id=2743663',
+        'url': 'https://broadband.espn.go.com/video/clip?id=18910086',
         'info_dict': {
-            'id': '2743663',
+            'id': '18910086',
             'ext': 'mp4',
-            'title': 'Must-See Moments: Best of the MLS season',
-            'description': 'md5:4c2d7232beaea572632bec41004f0aeb',
-            'timestamp': 1449446454,
-            'upload_date': '20151207',
+            'title': 'Kyrie spins around defender for two',
+            'description': 'md5:2b0f5bae9616d26fba8808350f0d2b9b',
+            'timestamp': 1489539155,
+            'upload_date': '20170315',
         },
         'params': {
             'skip_download': True,
         },
         'expected_warnings': ['Unable to download f4m manifest'],
     }, {
+        'url': 'http://nonredline.sports.espn.go.com/video/clip?id=19744672',
+        'only_matching': True,
+    }, {
+        'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/watch/player?id=19141491',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/watch/player/_/id/19141491',
+        'only_matching': True,
+    }, {
         'url': 'http://www.espn.com/video/clip?id=10365079',
         'only_matching': True,
     }, {
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index c0020dd7d..a3a97e940 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -45,6 +45,7 @@ from .anvato import AnvatoIE
 from .anysex import AnySexIE
 from .aol import AolIE
 from .allocine import AllocineIE
+from .aliexpress import AliExpressLiveIE
 from .aparat import AparatIE
 from .appleconnect import AppleConnectIE
 from .appletrailers import (
@@ -71,6 +72,10 @@ from .arte import (
     TheOperaPlatformIE,
     ArteTVPlaylistIE,
 )
+from .asiancrush import (
+    AsianCrushIE,
+    AsianCrushPlaylistIE,
+)
 from .atresplayer import AtresPlayerIE
 from .atttechchannel import ATTTechChannelIE
 from .atvat import ATVAtIE
@@ -90,7 +95,7 @@ from .azmedien import (
 )
 from .baidu import BaiduVideoIE
 from .bambuser import BambuserIE, BambuserChannelIE
-from .bandcamp import BandcampIE, BandcampAlbumIE
+from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE
 from .bbc import (
     BBCCoUkIE,
     BBCCoUkArticleIE,
@@ -98,7 +103,10 @@ from .bbc import (
     BBCCoUkPlaylistIE,
     BBCIE,
 )
-from .beampro import BeamProLiveIE
+from .beampro import (
+    BeamProLiveIE,
+    BeamProVodIE,
+)
 from .beeg import BeegIE
 from .behindkink import BehindKinkIE
 from .bellmedia import BellMediaIE
@@ -178,8 +186,9 @@ from .chirbit import (
     ChirbitProfileIE,
 )
 from .cinchcast import CinchcastIE
-from .clipfish import ClipfishIE
+from .cjsw import CJSWIE
 from .cliphunter import CliphunterIE
+from .clippit import ClippitIE
 from .cliprs import ClipRsIE
 from .clipsyndicate import ClipsyndicateIE
 from .closertotruth import CloserToTruthIE
@@ -251,7 +260,10 @@ from .democracynow import DemocracynowIE
 from .dfb import DFBIE
 from .dhm import DHMIE
 from .dotsub import DotsubIE
-from .douyutv import DouyuTVIE
+from .douyutv import (
+    DouyuShowIE,
+    DouyuTVIE,
+)
 from .dplay import (
     DPlayIE,
     DPlayItIE,
@@ -287,7 +299,10 @@ from .dw import (
 from .eagleplatform import EaglePlatformIE
 from .ebaumsworld import EbaumsWorldIE
 from .echomsk import EchoMskIE
-from .egghead import EggheadCourseIE
+from .egghead import (
+    EggheadCourseIE,
+    EggheadLessonIE,
+)
 from .ehow import EHowIE
 from .eighttracks import EightTracksIE
 from .einthusan import EinthusanIE
@@ -337,7 +352,12 @@ from .flipagram import FlipagramIE
 from .folketinget import FolketingetIE
 from .footyroom import FootyRoomIE
 from .formula1 import Formula1IE
-from .fourtube import FourTubeIE
+from .fourtube import (
+    FourTubeIE,
+    PornTubeIE,
+    PornerBrosIE,
+    FuxIE,
+)
 from .fox import FOXIE
 from .fox9 import FOX9IE
 from .foxgay import FoxgayIE
@@ -350,9 +370,9 @@ from .foxsports import FoxSportsIE
 from .franceculture import FranceCultureIE
 from .franceinter import FranceInterIE
 from .francetv import (
-    PluzzIE,
-    FranceTvInfoIE,
     FranceTVIE,
+    FranceTVEmbedIE,
+    FranceTVInfoIE,
     GenerationQuoiIE,
     CultureboxIE,
 )
@@ -386,7 +406,6 @@ from .globo import (
 from .go import GoIE
 from .go90 import Go90IE
 from .godtube import GodTubeIE
-from .godtv import GodTVIE
 from .golem import GolemIE
 from .googledrive import GoogleDriveIE
 from .googleplus import GooglePlusIE
@@ -460,6 +479,7 @@ from .jamendo import (
 )
 from .jeuxvideo import JeuxVideoIE
 from .jove import JoveIE
+from .joj import JojIE
 from .jwplatform import JWPlatformIE
 from .jpopsukitv import JpopsukiIE
 from .kaltura import KalturaIE
@@ -490,6 +510,7 @@ from .la7 import LA7IE
 from .laola1tv import (
     Laola1TvEmbedIE,
     Laola1TvIE,
+    ITTFIE,
 )
 from .lci import LCIIE
 from .lcp import (
@@ -517,7 +538,10 @@ from .limelight import (
     LimelightChannelListIE,
 )
 from .litv import LiTVIE
-from .liveleak import LiveLeakIE
+from .liveleak import (
+    LiveLeakIE,
+    LiveLeakEmbedIE,
+)
 from .livestream import (
     LivestreamIE,
     LivestreamOriginalIE,
@@ -540,9 +564,12 @@ from .mangomolo import (
     MangomoloVideoIE,
     MangomoloLiveIE,
 )
+from .manyvids import ManyVidsIE
 from .matchtv import MatchTVIE
 from .mdr import MDRIE
+from .mediaset import MediasetIE
 from .medici import MediciIE
+from .megaphone import MegaphoneIE
 from .meipai import MeipaiIE
 from .melonvod import MelonVODIE
 from .meta import METAIE
@@ -569,7 +596,6 @@ from .mixcloud import (
 )
 from .mlb import MLBIE
 from .mnet import MnetIE
-from .mpora import MporaIE
 from .moevideo import MoeVideoIE
 from .mofosex import MofosexIE
 from .mojvideo import MojvideoIE
@@ -630,7 +656,10 @@ from .neteasemusic import (
     NetEaseMusicProgramIE,
     NetEaseMusicDjRadioIE,
 )
-from .newgrounds import NewgroundsIE
+from .newgrounds import (
+    NewgroundsIE,
+    NewgroundsPlaylistIE,
+)
 from .newstube import NewstubeIE
 from .nextmedia import (
     NextMediaIE,
@@ -638,6 +667,10 @@ from .nextmedia import (
     AppleDailyIE,
     NextTVIE,
 )
+from .nexx import (
+    NexxIE,
+    NexxEmbedIE,
+)
 from .nfb import NFBIE
 from .nfl import NFLIE
 from .nhk import NhkVodIE
@@ -651,6 +684,7 @@ from .nick import (
     NickIE,
     NickDeIE,
     NickNightIE,
+    NickRuIE,
 )
 from .niconico import NiconicoIE, NiconicoPlaylistIE
 from .ninecninemedia import (
@@ -663,6 +697,7 @@ from .nintendo import NintendoIE
 from .njpwworld import NJPWWorldIE
 from .nobelprize import NobelPrizeIE
 from .noco import NocoIE
+from .nonktube import NonkTubeIE
 from .noovo import NoovoIE
 from .normalboots import NormalbootsIE
 from .nosvideo import NosVideoIE
@@ -732,8 +767,9 @@ from .openload import OpenloadIE
 from .ora import OraTVIE
 from .orf import (
     ORFTVthekIE,
-    ORFOE1IE,
     ORFFM4IE,
+    ORFFM4StoryIE,
+    ORFOE1IE,
     ORFIPTVIE,
 )
 from .packtpub import (
@@ -745,6 +781,7 @@ from .pandoratv import PandoraTVIE
 from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
 from .pbs import PBSIE
+from .pearvideo import PearVideoIE
 from .people import PeopleIE
 from .periscope import (
     PeriscopeIE,
@@ -810,11 +847,16 @@ from .radiobremen import RadioBremenIE
 from .radiofrance import RadioFranceIE
 from .rai import (
     RaiPlayIE,
+    RaiPlayLiveIE,
     RaiIE,
 )
 from .rbmaradio import RBMARadioIE
 from .rds import RDSIE
 from .redbulltv import RedBullTVIE
+from .reddit import (
+    RedditIE,
+    RedditRIE,
+)
 from .redtube import RedTubeIE
 from .regiotv import RegioTVIE
 from .rentv import (
@@ -858,9 +900,11 @@ from .rutube import (
     RutubeEmbedIE,
     RutubeMovieIE,
     RutubePersonIE,
+    RutubePlaylistIE,
 )
 from .rutv import RUTVIE
 from .ruutu import RuutuIE
+from .ruv import RuvIE
 from .sandia import SandiaIE
 from .safari import (
     SafariIE,
@@ -907,8 +951,9 @@ from .soundcloud import (
     SoundcloudIE,
     SoundcloudSetIE,
     SoundcloudUserIE,
+    SoundcloudTrackStationIE,
     SoundcloudPlaylistIE,
-    SoundcloudSearchIE
+    SoundcloudSearchIE,
 )
 from .soundgasm import (
     SoundgasmIE,
@@ -957,6 +1002,7 @@ from .tagesschau import (
     TagesschauIE,
 )
 from .tass import TassIE
+from .tastytrade import TastyTradeIE
 from .tbs import TBSIE
 from .tdslifeway import TDSLifewayIE
 from .teachertube import (
@@ -965,7 +1011,6 @@ from .teachertube import (
 )
 from .teachingchannel import TeachingChannelIE
 from .teamcoco import TeamcocoIE
-from .teamfourstar import TeamFourStarIE
 from .techtalks import TechTalksIE
 from .ted import TEDIE
 from .tele13 import Tele13IE
@@ -1014,11 +1059,6 @@ from .trilulilu import TriluliluIE
 from .trutv import TruTVIE
 from .tube8 import Tube8IE
 from .tubitv import TubiTvIE
-from .tudou import (
-    TudouIE,
-    TudouPlaylistIE,
-    TudouAlbumIE,
-)
 from .tumblr import TumblrIE
 from .tunein import (
     TuneInClipIE,
@@ -1098,6 +1138,10 @@ from .uplynk import (
     UplynkIE,
     UplynkPreplayIE,
 )
+from .upskill import (
+    UpskillIE,
+    UpskillCourseIE,
+)
 from .urort import UrortIE
 from .urplay import URPlayIE
 from .usanetwork import USANetworkIE
@@ -1125,6 +1169,7 @@ from .vgtv import (
 from .vh1 import VH1IE
 from .vice import (
     ViceIE,
+    ViceArticleIE,
     ViceShowIE,
 )
 from .viceland import VicelandIE
@@ -1187,12 +1232,14 @@ from .vk import (
 )
 from .vlive import (
     VLiveIE,
-    VLiveChannelIE
+    VLiveChannelIE,
+    VLivePlaylistIE
 )
 from .vodlocker import VodlockerIE
 from .vodpl import VODPlIE
 from .vodplatform import VODPlatformIE
 from .voicerepublic import VoiceRepublicIE
+from .voot import VootIE
 from .voxmedia import VoxMediaIE
 from .vporn import VpornIE
 from .vrt import VRTIE
@@ -1214,6 +1261,7 @@ from .washingtonpost import (
     WashingtonPostArticleIE,
 )
 from .wat import WatIE
+from .watchbox import WatchBoxIE
 from .watchindianporn import WatchIndianPornIE
 from .wdr import (
     WDRIE,
@@ -1263,12 +1311,12 @@ from .yahoo import (
     YahooIE,
     YahooSearchIE,
 )
-from .yam import YamIE
 from .yandexmusic import (
     YandexMusicTrackIE,
     YandexMusicAlbumIE,
     YandexMusicPlaylistIE,
 )
+from .yandexdisk import YandexDiskIE
 from .yesjapan import YesJapanIE
 from .yinyuetai import YinYueTaiIE
 from .ynet import YnetIE
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index b69c1ede0..4b3f6cc86 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -203,19 +203,19 @@ class FacebookIE(InfoExtractor):
     }]

     @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
-        if mobj is not None:
-            return mobj.group('url')
-
+    def _extract_urls(webpage):
+        urls = []
+        for mobj in re.finditer(
+                r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
+                webpage):
+            urls.append(mobj.group('url'))
         # Facebook API embed
         # see https://developers.facebook.com/docs/plugins/embedded-video-player
-        mobj = re.search(r'''(?x)<div[^>]+
+        for mobj in re.finditer(r'''(?x)<div[^>]+
                 class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
-                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage)
-        if mobj is not None:
-            return mobj.group('url')
+                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage):
+            urls.append(mobj.group('url'))
+        return urls

     def _login(self):
         (useremail, password) = self._get_login_info()
diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py
index 081c71842..4803a22c8 100644
--- a/youtube_dl/extractor/firsttv.py
+++ b/youtube_dl/extractor/firsttv.py
@@ -102,6 +102,8 @@ class FirstTVIE(InfoExtractor):
                     'format_id': f.get('name'),
                     'tbr': tbr,
                     'source_preference': quality(f.get('name')),
+                    # quality metadata of http formats may be incorrect
+                    'preference': -1,
                 })
             # m3u8 URL format is reverse engineered from [1] (search for
             # master.m3u8). dashEdges (that is currently balancer-vod.1tv.ru)
diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py
index 15736c9fe..9f9863746 100644
--- a/youtube_dl/extractor/fivetv.py
+++ b/youtube_dl/extractor/fivetv.py
@@ -43,7 +43,7 @@ class FiveTVIE(InfoExtractor):
         'info_dict': {
             'id': 'glavnoe',
             'ext': 'mp4',
-            'title': 'Итоги недели с 8 по 14 июня 2015 года',
+            'title': r're:^Итоги недели с \d+ по \d+ \w+ \d{4} года$',
             'thumbnail': r're:^https?://.*\.jpg$',
         },
     }, {
@@ -70,7 +70,8 @@ class FiveTVIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)

         video_url = self._search_regex(
-            r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"',
+            [r'<div[^>]+?class="flowplayer[^>]+?data-href="([^"]+)"',
+             r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
             webpage, 'video url')

         title = self._og_search_title(webpage, default=None) or self._search_regex(
diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py
index a8e1bf42a..9f166efd4 100644
--- a/youtube_dl/extractor/flickr.py
+++ b/youtube_dl/extractor/flickr.py
@@ -1,7 +1,10 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_urlencode,
+)
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -81,7 +84,7 @@ class FlickrIE(InfoExtractor):
             formats = []
             for stream in streams['stream']:
-                stream_type = str(stream.get('type'))
+                stream_type = compat_str(stream.get('type'))
                 formats.append({
                     'format_id': stream_type,
                     'url': stream['_content'],
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
index 9776c8422..ad273a0e7 100644
--- a/youtube_dl/extractor/fourtube.py
+++ b/youtube_dl/extractor/fourtube.py
@@ -3,39 +3,22 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     parse_duration,
     parse_iso8601,
-    sanitized_Request,
     str_to_int,
 )

-class FourTubeIE(InfoExtractor):
-    IE_NAME = '4tube'
-    _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)'
+class FourTubeBaseIE(InfoExtractor):
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        kind, video_id, display_id = mobj.group('kind', 'id', 'display_id')

-    _TEST = {
-        'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
-        'md5': '6516c8ac63b03de06bc8eac14362db4f',
-        'info_dict': {
-            'id': '209733',
-            'ext': 'mp4',
-            'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
-            'uploader': 'WCP Club',
-            'uploader_id': 'wcp-club',
-            'upload_date': '20131031',
-            'timestamp': 1383263892,
-            'duration': 583,
-            'view_count': int,
-            'like_count': int,
-            'categories': list,
-            'age_limit': 18,
-        }
-    }
+        if kind == 'm' or not display_id:
+            url = self._URL_TEMPLATE % video_id

-    def _real_extract(self, url):
-        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)

         title = self._html_search_meta('name', webpage)
@@ -43,10 +26,10 @@ class FourTubeIE(InfoExtractor):
             'uploadDate', webpage))
         thumbnail = self._html_search_meta('thumbnailUrl', webpage)
         uploader_id = self._html_search_regex(
-            r'<a class="item-to-subscribe" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
+            r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/([^/"]+)" title="Go to [^"]+ page">',
             webpage, 'uploader id', fatal=False)
         uploader = self._html_search_regex(
-            r'<a class="item-to-subscribe" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
+            r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/[^/"]+" title="Go to ([^"]+) page">',
             webpage, 'uploader', fatal=False)

         categories_html = self._search_regex(
@@ -60,10 +43,10 @@ class FourTubeIE(InfoExtractor):

         view_count = str_to_int(self._search_regex(
             r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">',
-            webpage, 'view count', fatal=False))
+            webpage, 'view count', default=None))
         like_count = str_to_int(self._search_regex(
             r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">',
-            webpage, 'like count', fatal=False))
+            webpage, 'like count', default=None))
         duration = parse_duration(self._html_search_meta('duration', webpage))

         media_id = self._search_regex(
@@ -85,14 +68,14 @@
             media_id = params[0]
             sources = ['%s' % p for p in params[2]]

-        token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format(
+        token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format(
             media_id, '+'.join(sources))
-        headers = {
-            b'Content-Type': b'application/x-www-form-urlencoded',
-            b'Origin': b'http://www.4tube.com',
-        }
-        token_req = sanitized_Request(token_url, b'{}', headers)
-        tokens = self._download_json(token_req, video_id)
+
+        parsed_url = compat_urlparse.urlparse(url)
+        tokens = self._download_json(token_url, video_id, data=b'', headers={
+            'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname),
+            'Referer': url,
+        })

         formats = [{
             'url': tokens[format]['token'],
             'format_id': format + 'p',
@@ -115,3 +98,126 @@
             'duration': duration,
             'age_limit': 18,
         }
+
+
+class FourTubeIE(FourTubeBaseIE):
+    IE_NAME = '4tube'
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?4tube\.com/(?:videos|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+    _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video'
+    _TESTS = [{
+        'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
+        'md5': '6516c8ac63b03de06bc8eac14362db4f',
+        'info_dict': {
+            'id': '209733',
+            'ext': 'mp4',
+            'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
+            'uploader': 'WCP Club',
+            'uploader_id': 'wcp-club',
+            'upload_date': '20131031',
+            'timestamp': 1383263892,
+            'duration': 583,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+    }, {
+        'url': 'http://www.4tube.com/embed/209733',
+        'only_matching': True,
+    }, {
+        'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
+        'only_matching': True,
+    }]
+
+
+class FuxIE(FourTubeBaseIE):
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?fux\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+    _URL_TEMPLATE = 'https://www.fux.com/video/%s/video'
+    _TESTS = [{
+        'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow',
+        'info_dict': {
+            'id': '195359',
+            'ext': 'mp4',
+            'title': 'Awesome fucking in the kitchen ends with cum swallow',
+            'uploader': 'alenci2342',
+            'uploader_id': 'alenci2342',
+            'upload_date': '20131230',
+            'timestamp': 1388361660,
+            'duration': 289,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.fux.com/embed/195359',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow',
+        'only_matching': True,
+    }]
+
+
+class PornTubeIE(FourTubeBaseIE):
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)'
+    _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s'
+    _TESTS = [{
+        'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759',
+        'info_dict': {
+            'id': '7089759',
+            'ext': 'mp4',
+            'title': 'Teen couple doing anal',
+            'uploader': 'Alexy',
+            'uploader_id': 'Alexy',
+            'upload_date': '20150606',
+            'timestamp': 1433595647,
+            'duration': 5052,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.porntube.com/embed/7089759',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759',
+        'only_matching': True,
+    }]
+
+
+class PornerBrosIE(FourTubeBaseIE):
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)'
+    _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s'
+    _TESTS = [{
+        'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369',
+        'md5': '6516c8ac63b03de06bc8eac14362db4f',
+        'info_dict': {
+            'id': '181369',
+            'ext': 'mp4',
+            'title': 'Skinny brunette takes big cock down her anal hole',
+            'uploader': 'PornerBros HD',
+            'uploader_id': 'pornerbros-hd',
+            'upload_date': '20130130',
+            'timestamp': 1359527401,
+            'duration': 1224,
+            'view_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.pornerbros.com/embed/181369',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369',
+        'only_matching': True,
+    }]
diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py
index 159fdf9c4..facc665f6 100644
--- a/youtube_dl/extractor/fox.py
+++ b/youtube_dl/extractor/fox.py
@@ -3,56 +3,99 @@ from __future__ import unicode_literals
 from .adobepass import AdobePassIE
 from ..utils import (
-    smuggle_url,
-    update_url_query,
+    int_or_none,
+    parse_age_limit,
+    parse_duration,
+    try_get,
+    unified_timestamp,
 )

 class FOXIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.fox.com/watch/255180355939/7684182528',
+    _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)'
+    _TESTS = [{
+        # clip
+        'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/',
         'md5': 'ebd296fcc41dd4b19f8115d8461a3165',
         'info_dict': {
-            'id': '255180355939',
+            'id': '4b765a60490325103ea69888fb2bd4e8',
             'ext': 'mp4',
-            'title': 'Official Trailer: Gotham',
-            'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.',
-            'duration': 129,
-            'timestamp': 1400020798,
-            'upload_date': '20140513',
-            'uploader': 'NEWA-FNG-FOXCOM',
+            'title': 'Aftermath: Bruce Wayne Develops Into The Dark Knight',
+            'description': 'md5:549cd9c70d413adb32ce2a779b53b486',
+            'duration': 102,
+            'timestamp': 1504291893,
+            'upload_date': '20170901',
+            'creator': 'FOX',
+            'series': 'Gotham',
         },
-        'add_ie': ['ThePlatform'],
-    }
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # episode, geo-restricted
+        'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/',
+        'only_matching': True,
+    }, {
+        # episode, geo-restricted, tv provider required
+        'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/',
+        'only_matching': True,
+    }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        settings = self._parse_json(self._search_regex(
-            r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
-            webpage, 'drupal settings'), video_id)
-        fox_pdk_player = settings['fox_pdk_player']
-        release_url = fox_pdk_player['release_url']
-        query = {
-            'mbr': 'true',
-            'switch': 'http'
-        }
-        if fox_pdk_player.get('access') == 'locked':
-            ap_p = settings['foxAdobePassProvider']
-            rating = ap_p.get('videoRating')
-            if rating == 'n/a':
-                rating = None
-            resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating)
-            query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource)
-
-        info = self._search_json_ld(webpage, video_id, fatal=False)
-        info.update({
-            '_type': 'url_transparent',
-            'ie_key': 'ThePlatform',
-            'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),
-            'id': video_id,
-        })
-        return info
+        video = self._download_json(
+            'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id,
+            video_id, headers={
+                'apikey': 'abdcbed02c124d393b39e818a4312055',
+                'Content-Type': 'application/json',
+                'Referer': url,
+            })
+
+        title = video['name']
+
+        m3u8_url = self._download_json(
+            video['videoRelease']['url'], video_id)['playURL']
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        description = video.get('description')
+        duration = int_or_none(video.get('durationInSeconds')) or int_or_none(
+            video.get('duration')) or parse_duration(video.get('duration'))
+        timestamp = unified_timestamp(video.get('datePublished'))
+        age_limit = parse_age_limit(video.get('contentRating'))
+
+        data = try_get(
+            video, lambda x: x['trackingData']['properties'], dict) or {}
+
+        creator = data.get('brand') or data.get('network') or video.get('network')
+
+        series = video.get('seriesName') or data.get(
+            'seriesName') or data.get('show')
+        season_number = int_or_none(video.get('seasonNumber'))
+        episode = video.get('name')
+        episode_number = int_or_none(video.get('episodeNumber'))
+        release_year = int_or_none(video.get('releaseYear'))
+
+        if data.get('authRequired'):
+            # TODO: AP
+            pass
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'age_limit': age_limit,
+            'creator': creator,
+            'series': series,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
+            'release_year': release_year,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py
index e887ae488..512a10645 100644
--- a/youtube_dl/extractor/foxgay.py
+++ b/youtube_dl/extractor/foxgay.py
@@ -5,6 +5,7 @@ import itertools
 from .common import InfoExtractor
 from ..utils import (
     get_element_by_id,
+    int_or_none,
     remove_end,
 )
@@ -46,7 +47,7 @@ class FoxgayIE(InfoExtractor):
         formats = [{
             'url': source,
-            'height': resolution,
+            'height': int_or_none(resolution),
         } for source, resolution in zip(
             video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))]
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 48d43ae58..2bcbb3e39 100644
--- a/youtube_dl/extractor/francetv.py
+++
b/youtube_dl/extractor/francetv.py @@ -21,11 +21,13 @@ from .dailymotion import (  class FranceTVBaseInfoExtractor(InfoExtractor): -    def _extract_video(self, video_id, catalogue): +    def _extract_video(self, video_id, catalogue=None):          info = self._download_json( -            'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=%s&catalogue=%s' -            % (video_id, catalogue), -            video_id, 'Downloading video JSON') +            'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', +            video_id, 'Downloading video JSON', query={ +                'idDiffusion': video_id, +                'catalogue': catalogue or '', +            })          if info.get('status') == 'NOK':              raise ExtractorError( @@ -109,27 +111,100 @@ class FranceTVBaseInfoExtractor(InfoExtractor):          } -class PluzzIE(FranceTVBaseInfoExtractor): -    IE_NAME = 'pluzz.francetv.fr' -    _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html' +class FranceTVIE(FranceTVBaseInfoExtractor): +    _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P<id>[^/]+)\.html' -    # Can't use tests, videos expire in 7 days +    _TESTS = [{ +        'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', +        'info_dict': { +            'id': '157550144', +            'ext': 'mp4', +            'title': '13h15, le dimanche... - Les mystères de Jésus', +            'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', +            'timestamp': 1494156300, +            'upload_date': '20170507', +        }, +        'params': { +            # m3u8 downloads +            'skip_download': True, +        }, +    }, { +        # france3 +        'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', +        'only_matching': True, +    }, { +        # france4 +        'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html', +        'only_matching': True, +    }, { +        # france5 +        'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html', +        'only_matching': True, +    }, { +        # franceo +        'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html', +        'only_matching': True, +    }, { +        # france2 live +        'url': 'https://www.france.tv/france-2/direct.html', +        'only_matching': True, +    }, { +        'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html', +        'only_matching': True, +    }, { +        'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html', +        'only_matching': True, +    }, { +        'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html', +        'only_matching': True, +    }, { +        'url': 'https://www.france.tv/142749-rouge-sang.html', +        'only_matching': True, +    }]      def _real_extract(self, url):          display_id = self._match_id(url)          webpage = self._download_webpage(url, display_id) -        video_id = self._html_search_meta( -            'id_video', webpage, 'video id', default=None) +        catalogue = None +        video_id = self._search_regex( +            r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1', +            webpage, 'video id', default=None, 
group='id') +          if not video_id: -            video_id = self._search_regex( -                r'data-diffusion=["\'](\d+)', webpage, 'video id') +            video_id, catalogue = self._html_search_regex( +                r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', +                webpage, 'video ID').split('@') +        return self._extract_video(video_id, catalogue) -        return self._extract_video(video_id, 'Pluzz') +class FranceTVEmbedIE(FranceTVBaseInfoExtractor): +    _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)' -class FranceTvInfoIE(FranceTVBaseInfoExtractor): +    _TEST = { +        'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961', +        'info_dict': { +            'id': 'NI_983319', +            'ext': 'mp4', +            'title': 'Le Pen Reims', +            'upload_date': '20170505', +            'timestamp': 1493981780, +            'duration': 16, +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        video = self._download_json( +            'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id, +            video_id) + +        return self._extract_video(video['video_id'], video.get('catalog')) + + +class FranceTVInfoIE(FranceTVBaseInfoExtractor):      IE_NAME = 'francetvinfo.fr'      _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' @@ -233,124 +308,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):          return self._extract_video(video_id, catalogue) -class FranceTVIE(FranceTVBaseInfoExtractor): -    IE_NAME = 'francetv' -    IE_DESC = 'France 2, 3, 4, 5 and Ô' -    _VALID_URL = r'''(?x) -                    https?:// -                        (?: -                            (?:www\.)?france[2345o]\.fr/ -                                (?: -                                    emissions/[^/]+/(?:videos|diffusions)| -                                    emission/[^/]+| -                                    videos| -                                    jt -                                ) -                            /| -                            embed\.francetv\.fr/\?ue= -                        ) -                        (?P<id>[^/?]+) -                    ''' - -    _TESTS = [ -        # france2 -        { -            'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', -            'md5': 'c03fc87cb85429ffd55df32b9fc05523', -            'info_dict': { -                'id': '109169362', -                'ext': 'flv', -                'title': '13h15, le dimanche...', -                'description': 'md5:9a0932bb465f22d377a449be9d1a0ff7', -                'upload_date': '20140914', -                'timestamp': 1410693600, -            }, -        }, -        # france3 -        { -            'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', -            'md5': '679bb8f8921f8623bd658fa2f8364da0', -            'info_dict': { -                'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', -                'ext': 'mp4', -                'title': 'Le scandale du prix des médicaments', -                'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce', -                'upload_date': '20131113', -                'timestamp': 1384380000, -            }, -        }, -        # france4 -        { -            'url': 
'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', -            'md5': 'a182bf8d2c43d88d46ec48fbdd260c1c', -            'info_dict': { -                'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', -                'ext': 'mp4', -                'title': 'Hero Corp Making of - Extrait 1', -                'description': 'md5:c87d54871b1790679aec1197e73d650a', -                'upload_date': '20131106', -                'timestamp': 1383766500, -            }, -        }, -        # france5 -        { -            'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1', -            'md5': 'f6c577df3806e26471b3d21631241fd0', -            'info_dict': { -                'id': '123327454', -                'ext': 'flv', -                'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?', -                'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4', -                'upload_date': '20150831', -                'timestamp': 1441035120, -            }, -        }, -        # franceo -        { -            'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015', -            'md5': '47d5816d3b24351cdce512ad7ab31da8', -            'info_dict': { -                'id': '125377621', -                'ext': 'flv', -                'title': 'Infô soir', -                'description': 'md5:01b8c6915a3d93d8bbbd692651714309', -                'upload_date': '20150718', -                'timestamp': 1437241200, -                'duration': 414, -            }, -        }, -        { -            # francetv embed -            'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87', -            'info_dict': { -                'id': 'EV_30231', -                'ext': 'flv', -                'title': 'Alcaline, le concert avec Calogero', -                'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', -                'upload_date': '20150226', -                'timestamp': 1424989860, -                'duration': 5400, -            }, -        }, -        { -            'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05', -            'only_matching': True, -        }, -        { -            'url': 'http://www.franceo.fr/videos/125377617', -            'only_matching': True, -        } -    ] - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) -        video_id, catalogue = self._html_search_regex( -            r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', -            webpage, 'video ID').split('@') -        return self._extract_video(video_id, catalogue) - -  class GenerationQuoiIE(InfoExtractor):      IE_NAME = 'france2.fr:generation-quoi'      _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<id>[^/?#]+)' diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index e44a2a87f..8c37509ec 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -2,15 +2,11 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..compat import ( -    compat_HTTPError, -    compat_urllib_parse_unquote_plus, -) +from ..compat import compat_HTTPError  from ..utils import (      determine_ext,      int_or_none,      js_to_json, -    sanitized_Request,  
    ExtractorError,      urlencode_postdata  ) @@ -20,6 +16,7 @@ class FunimationIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)'      _NETRC_MACHINE = 'funimation' +    _TOKEN = None      _TESTS = [{          'url': 'https://www.funimation.com/shows/hacksign/role-play/', @@ -38,56 +35,38 @@ class FunimationIE(InfoExtractor):      }, {          'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/',          'info_dict': { -            'id': '9635', +            'id': '210051',              'display_id': 'broadcast-dub-preview',              'ext': 'mp4',              'title': 'Attack on Titan: Junior High - Broadcast Dub Preview', -            'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',              'thumbnail': r're:https?://.*\.(?:jpg|png)',          }, -        'skip': 'Access without user interaction is forbidden by CloudFlare', +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }, {          'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/',          'only_matching': True,      }] -    _LOGIN_URL = 'http://www.funimation.com/login' - -    def _extract_cloudflare_session_ua(self, url): -        ci_session_cookie = self._get_cookies(url).get('ci_session') -        if ci_session_cookie: -            ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value) -            # ci_session is a string serialized by PHP function serialize() -            # This case is simple enough to use regular expressions only -            return self._search_regex( -                r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent', -                default=None) -      def _login(self):          (username, password) = self._get_login_info()          if username is None:              return -        data = urlencode_postdata({ -            'email_field': username, -            'password_field': password, -        }) -        user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL) -        if not user_agent: -            user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0' -        login_request = sanitized_Request(self._LOGIN_URL, data, headers={ -            'User-Agent': user_agent, -            'Content-Type': 'application/x-www-form-urlencoded' -        }) -        login_page = self._download_webpage( -            login_request, None, 'Logging in as %s' % username) -        if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')): -            return -        error = self._html_search_regex( -            r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>', -            login_page, 'error messages', default=None) -        if error: -            raise ExtractorError('Unable to login: %s' % error, expected=True) -        raise ExtractorError('Unable to log in') +        try: +            data = self._download_json( +                'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', +                None, 'Logging in as %s' % username, data=urlencode_postdata({ +                    'username': username, +                    'password': password, +                })) +            self._TOKEN = data['token'] +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: +                error = self._parse_json(e.cause.read().decode(), None)['error'] +        
        raise ExtractorError(error, expected=True) +            raise      def _real_initialize(self):          self._login() @@ -125,9 +104,12 @@ class FunimationIE(InfoExtractor):          description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True)          try: +            headers = {} +            if self._TOKEN: +                headers['Authorization'] = 'Token %s' % self._TOKEN              sources = self._download_json(                  'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id, -                video_id)['items'] +                video_id, headers=headers)['items']          except ExtractorError as e:              if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:                  error = self._parse_json(e.cause.read(), video_id)['errors'][0] diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 49409369c..f85e7de14 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -1,10 +1,14 @@  from __future__ import unicode_literals -import json  import re  from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( +    ExtractorError, +    float_or_none, +    int_or_none, +    unified_timestamp, +)  class FunnyOrDieIE(InfoExtractor): @@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor):              'title': 'Heart-Shaped Box: Literal Video Version',              'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',              'thumbnail': r're:^http:.*\.jpg$', +            'uploader': 'DASjr', +            'timestamp': 1317904928, +            'upload_date': '20111006', +            'duration': 318.3,          },      }, {          'url': 'http://www.funnyordie.com/embed/e402820827', @@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor):              'title': 'Please Use This Song (Jon Lajoie)',              'description': 'Please use this to sell something.  
www.jonlajoie.com',              'thumbnail': r're:^http:.*\.jpg$', +            'timestamp': 1398988800, +            'upload_date': '20140502',          },          'params': {              'skip_download': True, @@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor):                  'url': 'http://www.funnyordie.com%s' % src,              }] -        post_json = self._search_regex( -            r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details') -        post = json.loads(post_json) +        timestamp = unified_timestamp(self._html_search_meta( +            'uploadDate', webpage, 'timestamp', default=None)) + +        uploader = self._html_search_regex( +            r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h', +            webpage, 'uploader', default=None) + +        title, description, thumbnail, duration = [None] * 4 + +        medium = self._parse_json( +            self._search_regex( +                r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium', +                default='{}'), +            video_id, fatal=False) +        if medium: +            title = medium.get('title') +            duration = float_or_none(medium.get('duration')) +            if not timestamp: +                timestamp = unified_timestamp(medium.get('publishDate')) + +        post = self._parse_json( +            self._search_regex( +                r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details', +                default='{}'), +            video_id, fatal=False) +        if post: +            if not title: +                title = post.get('name') +            description = post.get('description') +            thumbnail = post.get('picture') + +        if not title: +            title = self._og_search_title(webpage) +        if not description: +            description = self._og_search_description(webpage) +        if not duration: +            duration = int_or_none(self._html_search_meta( +                ('video:duration', 'duration'), webpage, 'duration', default=False))          return {              'id': video_id, -            'title': post['name'], -            'description': post.get('description'), -            'thumbnail': post.get('picture'), +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'uploader': uploader, +            'timestamp': timestamp, +            'duration': duration,              'formats': formats,              'subtitles': subtitles,          } diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index 36ba7d8cf..1726a6704 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -6,62 +6,52 @@ from .common import InfoExtractor  from ..utils import (      float_or_none,      int_or_none, -    js_to_json,      unified_strdate,  )  class GaskrankIE(InfoExtractor): -    """InfoExtractor for gaskrank.tv""" -    _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P<categories>[^/]+)/(?P<id>[^/]+)\.html?' -    _TESTS = [ -        { -            'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', -            'md5': '1ae88dbac97887d85ebd1157a95fc4f9', -            'info_dict': { -                'id': '201601/26955', -                'ext': 'mp4', -                'title': 'Strike! 
Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', -                'thumbnail': r're:^https?://.*\.jpg$', -                'categories': ['motorrad-fun'], -                'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', -                'uploader_id': 'Bikefun', -                'upload_date': '20170110', -                'uploader_url': None, -            } -        }, -        { -            'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm', -            'md5': 'c33ee32c711bc6c8224bfcbe62b23095', -            'info_dict': { -                'id': '201106/15920', -                'ext': 'mp4', -                'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken', -                'thumbnail': r're:^https?://.*\.jpg$', -                'categories': ['racing'], -                'display_id': 'isle-of-man-tt-2011-michael-du-15920', -                'uploader_id': 'IOM', -                'upload_date': '20160506', -                'uploader_url': 'www.iomtt.com', -            } +    _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P<categories>[^/]+)/(?P<id>[^/]+)\.htm' +    _TESTS = [{ +        'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', +        'md5': '1ae88dbac97887d85ebd1157a95fc4f9', +        'info_dict': { +            'id': '201601/26955', +            'ext': 'mp4', +            'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', +            'thumbnail': r're:^https?://.*\.jpg$', +            'categories': ['motorrad-fun'], +            'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', +            'uploader_id': 'Bikefun', +            'upload_date': '20170110', +            'uploader_url': None,          } -    ] +    }, { +        'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm', +        'md5': 'c33ee32c711bc6c8224bfcbe62b23095', +        'info_dict': { +            'id': '201106/15920', +            'ext': 'mp4', +            'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken', +            'thumbnail': r're:^https?://.*\.jpg$', +            'categories': ['racing'], +            'display_id': 'isle-of-man-tt-2011-michael-du-15920', +            'uploader_id': 'IOM', +            'upload_date': '20170523', +            'uploader_url': 'www.iomtt.com', +        } +    }]      def _real_extract(self, url): -        """extract information from gaskrank.tv""" -        def fix_json(code): -            """Removes trailing comma in json: {{},} --> {{}}""" -            return re.sub(r',\s*}', r'}', js_to_json(code)) -          display_id = self._match_id(url) +          webpage = self._download_webpage(url, display_id) + +        title = self._og_search_title( +            webpage, default=None) or self._html_search_meta( +            'title', webpage, fatal=True) +          categories = [re.match(self._VALID_URL, url).group('categories')] -        title = self._search_regex( -            r'movieName\s*:\s*\'([^\']*)\'', -            webpage, 'title') -        thumbnail = self._search_regex( -            r'poster\s*:\s*\'([^\']*)\'', -            webpage, 'thumbnail', default=None)          mobj = re.search(              r'Video von:\s*(?P<uploader_id>[^|]*?)\s*\|\s*vom:\s*(?P<upload_date>[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])', @@ -89,29 +79,14 @@ class 
GaskrankIE(InfoExtractor):          if average_rating:              average_rating = float_or_none(average_rating.replace(',', '.')) -        playlist = self._parse_json( -            self._search_regex( -                r'playlist\s*:\s*\[([^\]]*)\]', -                webpage, 'playlist', default='{}'), -            display_id, transform_source=fix_json, fatal=False) -          video_id = self._search_regex(              r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4', -            playlist.get('0').get('src'), 'video id') - -        formats = [] -        for key in playlist: -            formats.append({ -                'url': playlist[key]['src'], -                'format_id': key, -                'quality': playlist[key].get('quality')}) -        self._sort_formats(formats, field_preference=['format_id']) +            webpage, 'video id', default=display_id) -        return { +        entry = self._parse_html5_media_entries(url, webpage, video_id)[0] +        entry.update({              'id': video_id,              'title': title, -            'formats': formats, -            'thumbnail': thumbnail,              'categories': categories,              'display_id': display_id,              'uploader_id': uploader_id, @@ -120,4 +95,7 @@ class GaskrankIE(InfoExtractor):              'tags': tags,              'view_count': view_count,              'average_rating': average_rating, -        } +        }) +        self._sort_formats(entry['formats']) + +        return entry diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 3136427db..f71d9092e 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -75,6 +75,19 @@ class GDCVaultIE(InfoExtractor):                  'format': 'jp',  # The japanese audio              }          }, +        { +            # gdc-player.html +            'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo', +            'info_dict': { +                'id': '1435', +                'display_id': 'An-American-engine-in-Tokyo', +                'ext': 'flv', +                'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT', +            }, +            'params': { +                'skip_download': True,  # Requires rtmpdump +            }, +        },      ]      def _login(self, webpage_url, display_id): @@ -128,7 +141,7 @@ class GDCVaultIE(InfoExtractor):                  'title': title,              } -        PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>' +        PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>'          xml_root = self._html_search_regex(              PLAYER_REGEX, start_page, 'xml root', default=None) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b06f43446..b83c18380 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -10,6 +10,7 @@ from .common import InfoExtractor  from .youtube import YoutubeIE  from ..compat import (      compat_etree_fromstring, +    compat_str,      compat_urllib_parse_unquote,      compat_urlparse,      compat_xml_parse_error, @@ -35,6 +36,10 @@ from .brightcove import (      BrightcoveLegacyIE,      BrightcoveNewIE,  ) +from .nexx import ( +    NexxIE, +    NexxEmbedIE, +)  from .nbc import NBCSportsVPlayerIE  from .ooyala import OoyalaIE  from .rutv import RUTVIE @@ -56,6 +61,7 @@ from .dailymotion import (      
DailymotionIE,      DailymotionCloudIE,  ) +from .dailymail import DailyMailIE  from .onionstudios import OnionStudiosIE  from .viewlift import ViewLiftEmbedIE  from .mtv import MTVServicesEmbeddedIE @@ -88,6 +94,11 @@ from .rutube import RutubeIE  from .limelight import LimelightBaseIE  from .anvato import AnvatoIE  from .washingtonpost import WashingtonPostIE +from .wistia import WistiaIE +from .mediaset import MediasetIE +from .joj import JojIE +from .megaphone import MegaphoneIE +from .vzaar import VzaarIE  class GenericIE(InfoExtractor): @@ -565,6 +576,19 @@ class GenericIE(InfoExtractor):              },              'skip': 'movie expired',          }, +        # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js +        { +            'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', +            'info_dict': { +                'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', +                'ext': 'mp4', +                'title': 'Steampunk Fest Comes to Honesdale', +                'duration': 43.276, +            }, +            'params': { +                'skip_download': True, +            } +        },          # embed.ly video          {              'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -756,6 +780,20 @@ class GenericIE(InfoExtractor):              },              'add_ie': ['Dailymotion'],          }, +        # DailyMail embed +        { +            'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot', +            'info_dict': { +                'id': '1495629', +                'ext': 'mp4', +                'title': 'Care worker punches elderly dementia patient in head 11 times', +                'description': 'md5:3a743dee84e57e48ec68bf67113199a5', +            }, +            'add_ie': ['DailyMail'], +            'params': { +                'skip_download': True, +            }, +        },          # YouTube embed          {              'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', @@ -1182,7 +1220,7 @@ class GenericIE(InfoExtractor):              },              'add_ie': ['Kaltura'],          }, -        # Eagle.Platform embed (generic URL) +        # EaglePlatform embed (generic URL)          {              'url': 'http://lenta.ru/news/2015/03/06/navalny/',              # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used @@ -1196,8 +1234,26 @@ class GenericIE(InfoExtractor):                  'view_count': int,                  'age_limit': 0,              }, +            'params': { +                'skip_download': True, +            }, +        }, +        # referrer protected EaglePlatform embed +        { +            'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', +            'info_dict': { +                'id': '582306', +                'ext': 'mp4', +                'title': 'Стас Намин: «Мы нарушили девственность Кремля»', +                'thumbnail': r're:^https?://.*\.jpg$', +                'duration': 3382, +                'view_count': int, +            }, +            'params': { +                'skip_download': True, +            },          }, -        # ClipYou (Eagle.Platform) embed (custom URL) +        # ClipYou (EaglePlatform) embed (custom URL)          {              'url': 'http://muz-tv.ru/play/7129/',              # Not checking MD5 as 
sometimes the direct HTTP link results in 404 and HLS is used @@ -1209,6 +1265,9 @@ class GenericIE(InfoExtractor):                  'duration': 216,                  'view_count': int,              }, +            'params': { +                'skip_download': True, +            },          },          # Pladform embed          { @@ -1460,14 +1519,27 @@ class GenericIE(InfoExtractor):          # LiveLeak embed          {              'url': 'http://www.wykop.pl/link/3088787/', -            'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', +            'md5': '7619da8c820e835bef21a1efa2a0fc71',              'info_dict': {                  'id': '874_1459135191',                  'ext': 'mp4',                  'title': 'Man shows poor quality of new apartment building',                  'description': 'The wall is like a sand pile.',                  'uploader': 'Lake8737', -            } +            }, +            'add_ie': [LiveLeakIE.ie_key()], +        }, +        # Another LiveLeak embed pattern (#13336) +        { +            'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', +            'info_dict': { +                'id': '2eb_1496309988', +                'ext': 'mp4', +                'title': 'Thief robs place where everyone was armed', +                'description': 'md5:694d73ee79e535953cf2488562288eee', +                'uploader': 'brazilwtf', +            }, +            'add_ie': [LiveLeakIE.ie_key()],          },          # Duplicated embedded video URLs          { @@ -1509,6 +1581,22 @@ class GenericIE(InfoExtractor):              },              'add_ie': ['BrightcoveLegacy'],          }, +        # Nexx embed +        { +            'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503', +            'info_dict': { +                'id': '247746', +                'ext': 'mp4', +                'title': "Yesterday's Jam (OV)", +                'description': 'md5:09bc0984723fed34e2581624a84e05f0', +                'timestamp': 1492594816, +                'upload_date': '20170419', +            }, +            'params': { +                'format': 'bestvideo', +                'skip_download': True, +            }, +        },          # Facebook <iframe> embed          {              'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', @@ -1519,6 +1607,21 @@ class GenericIE(InfoExtractor):                  'title': 'Facebook video #599637780109885',              },          }, +        # Facebook <iframe> embed, plugin video +        { +            'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/', +            'info_dict': { +                'id': '1754168231264132', +                'ext': 'mp4', +                'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...', +                'uploader': 'Tariq Ramadan (official)', +                'timestamp': 1496758379, +                'upload_date': '20170606', +            }, +            'params': { +                'skip_download': True, +            }, +        },          # Facebook API embed          {              'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', @@ -1696,6 +1799,21 @@ class GenericIE(InfoExtractor):              'playlist_mincount': 5,          },          { +            # Limelight embed (LimelightPlayerUtil.embed) +            'url': 
'https://tv5.ca/videos?v=xuu8qowr291ri', +            'info_dict': { +                'id': '95d035dc5c8a401588e9c0e6bd1e9c92', +                'ext': 'mp4', +                'title': '07448641', +                'timestamp': 1499890639, +                'upload_date': '20170712', +            }, +            'params': { +                'skip_download': True, +            }, +            'add_ie': ['LimelightMedia'], +        }, +        {              'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',              'info_dict': {                  'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', @@ -1718,6 +1836,49 @@ class GenericIE(InfoExtractor):              },              'add_ie': [WashingtonPostIE.ie_key()],          }, +        { +            # Mediaset embed +            'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', +            'info_dict': { +                'id': '720642', +                'ext': 'mp4', +                'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"', +            }, +            'params': { +                'skip_download': True, +            }, +            'add_ie': [MediasetIE.ie_key()], +        }, +        { +            # JOJ.sk embeds +            'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok', +            'info_dict': { +                'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok', +                'title': 'Slovenskom sa prehnala vlna silných búrok', +            }, +            'playlist_mincount': 5, +            'add_ie': [JojIE.ie_key()], +        }, +        { +            # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) +            'url': 'https://tvrain.ru/amp/418921/', +            'md5': 'cc00413936695987e8de148b67d14f1d', +            'info_dict': { +                'id': '418921', +                'ext': 'mp4', +                'title': 'Стас Намин: «Мы нарушили девственность Кремля»', +            }, +        }, +        { +            # vzaar embed +            'url': 'http://help.vzaar.com/article/165-embedding-video', +            'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', +            'info_dict': { +                'id': '8707641', +                'ext': 'mp4', +                'title': 'Building A Business Online: Principal Chairs Q & A', +            }, +        },          # {          #     # TODO: find another test          #     # http://schema.org/VideoObject @@ -1867,7 +2028,7 @@ class GenericIE(InfoExtractor):          if head_response is not False:              # Check for redirect -            new_url = head_response.geturl() +            new_url = compat_str(head_response.geturl())              if url != new_url:                  self.report_following_redirect(new_url)                  if force_videoid: @@ -1892,14 +2053,14 @@ class GenericIE(InfoExtractor):          content_type = head_response.headers.get('Content-Type', '').lower()          m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)          if m: -            format_id = m.group('format_id') +            format_id = compat_str(m.group('format_id'))              if format_id.endswith('mpegurl'):                  formats = self._extract_m3u8_formats(url, video_id, 'mp4')              
elif format_id == 'f4m':                  formats = self._extract_f4m_formats(url, video_id)              else:                  formats = [{ -                    'format_id': m.group('format_id'), +                    'format_id': format_id,                      'url': url,                      'vcodec': 'none' if m.group('type') == 'audio' else None                  }] @@ -1968,7 +2129,7 @@ class GenericIE(InfoExtractor):              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):                  info_dict['formats'] = self._parse_mpd_formats(                      doc, video_id, -                    mpd_base_url=full_response.geturl().rpartition('/')[0], +                    mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0],                      mpd_url=url)                  self._sort_formats(info_dict['formats'])                  return info_dict @@ -2017,6 +2178,13 @@ class GenericIE(InfoExtractor):          video_description = self._og_search_description(webpage, default=None)          video_thumbnail = self._og_search_thumbnail(webpage, default=None) +        info_dict.update({ +            'title': video_title, +            'description': video_description, +            'thumbnail': video_thumbnail, +            'age_limit': age_limit, +        }) +          # Look for Brightcove Legacy Studio embeds          bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)          if bc_urls: @@ -2038,6 +2206,16 @@ class GenericIE(InfoExtractor):          if bc_urls:              return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') +        # Look for Nexx embeds +        nexx_urls = NexxIE._extract_urls(webpage) +        if nexx_urls: +            return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) + +        # Look for Nexx iFrame embeds +        nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) +        if nexx_embed_urls: +            return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) +          # Look for ThePlatform embeds          tp_urls = ThePlatformIE._extract_urls(webpage)          if tp_urls: @@ -2065,36 +2243,11 @@ class GenericIE(InfoExtractor):          if vid_me_embed_url is not None:              return self.url_result(vid_me_embed_url, 'Vidme') -        # Look for embedded YouTube player -        matches = re.findall(r'''(?x) -            (?: -                <iframe[^>]+?src=| -                data-video-url=| -                <embed[^>]+?src=| -                embedSWF\(?:\s*| -                <object[^>]+data=| -                new\s+SWFObject\( -            ) -            (["\']) -                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ -                (?:embed|v|p)/.+?) 
-            \1''', webpage) -        if matches: +        # Look for YouTube embeds +        youtube_urls = YoutubeIE._extract_urls(webpage) +        if youtube_urls:              return self.playlist_from_matches( -                matches, video_id, video_title, lambda m: unescapeHTML(m[1])) - -        # Look for lazyYT YouTube embed -        matches = re.findall( -            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) -        if matches: -            return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) - -        # Look for Wordpress "YouTube Video Importer" plugin -        matches = re.findall(r'''(?x)<div[^>]+ -            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ -            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) -        if matches: -            return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) +                youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())          matches = DailymotionIE._extract_urls(webpage)          if matches: @@ -2110,58 +2263,27 @@ class GenericIE(InfoExtractor):                  return self.playlist_from_matches(                      playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) -        # Look for embedded Wistia player -        match = re.search( -            r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) -        if match: -            embed_url = self._proto_relative_url( -                unescapeHTML(match.group('url'))) -            return { -                '_type': 'url_transparent', -                'url': embed_url, -                'ie_key': 'Wistia', -                'uploader': video_uploader, -            } +        # Look for DailyMail embeds +        dailymail_urls = DailyMailIE._extract_urls(webpage) +        if dailymail_urls: +            return self.playlist_from_matches( +                dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) -        match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) -        if match: +        # Look for embedded Wistia player +        wistia_url = WistiaIE._extract_url(webpage) +        if wistia_url:              return {                  '_type': 'url_transparent', -                'url': 'wistia:%s' % match.group('id'), -                'ie_key': 'Wistia', +                'url': self._proto_relative_url(wistia_url), +                'ie_key': WistiaIE.ie_key(),                  'uploader': video_uploader,              } -        match = re.search( -            r'''(?sx) -                <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? 
-                <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 -            ''', webpage) -        if match: -            return self.url_result(self._proto_relative_url( -                'wistia:%s' % match.group('id')), 'Wistia') -          # Look for SVT player          svt_url = SVTIE._extract_url(webpage)          if svt_url:              return self.url_result(svt_url, 'SVT') -        # Look for embedded condenast player -        matches = re.findall( -            r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")', -            webpage) -        if matches: -            return { -                '_type': 'playlist', -                'entries': [{ -                    '_type': 'url', -                    'ie_key': 'CondeNast', -                    'url': ma, -                } for ma in matches], -                'title': video_title, -                'id': video_id, -            } -          # Look for Bandcamp pages with custom domain          mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)          if mobj is not None: @@ -2198,6 +2320,7 @@ class GenericIE(InfoExtractor):          # Look for Ooyala videos          mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or                  re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or +                re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or                  re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or                  re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))          if mobj is not None: @@ -2243,9 +2366,9 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'))          # Look for embedded Facebook player -        facebook_url = FacebookIE._extract_url(webpage) -        if facebook_url is not None: -            return self.url_result(facebook_url, 'Facebook') +        facebook_urls = FacebookIE._extract_urls(webpage) +        if facebook_urls: +            return self.playlist_from_matches(facebook_urls, video_id, video_title)          # Look for embedded VK player          mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) @@ -2442,12 +2565,12 @@ class GenericIE(InfoExtractor):          if kaltura_url:              return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) -        # Look for Eagle.Platform embeds +        # Look for EaglePlatform embeds          eagleplatform_url = EaglePlatformIE._extract_url(webpage)          if eagleplatform_url: -            return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key()) +            return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) -        # Look for ClipYou (uses Eagle.Platform) embeds +        # Look for ClipYou (uses EaglePlatform) embeds          mobj = re.search(              r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)          if mobj is not None: @@ -2555,29 +2678,6 @@ class GenericIE(InfoExtractor):              return self.playlist_result(                  limelight_urls, video_id, video_title, video_description) -        mobj = 
re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) -        if mobj: -            lm = { -                'Media': 'media', -                'Channel': 'channel', -                'ChannelList': 'channel_list', -            } -            return self.url_result(smuggle_url('limelight:%s:%s' % ( -                lm[mobj.group(1)], mobj.group(2)), {'source_url': url}), -                'Limelight%s' % mobj.group(1), mobj.group(2)) - -        mobj = re.search( -            r'''(?sx) -                <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*? -                    <param[^>]+ -                        name=(["\'])flashVars\2[^>]+ -                        value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32}) -            ''', webpage) -        if mobj: -            return self.url_result(smuggle_url( -                'limelight:media:%s' % mobj.group('id'), -                {'source_url': url}), 'LimelightMedia', mobj.group('id')) -          # Look for Anvato embeds          anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)          if anvato_urls: @@ -2645,9 +2745,9 @@ class GenericIE(InfoExtractor):                  self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())          # Look for LiveLeak embeds -        liveleak_url = LiveLeakIE._extract_url(webpage) -        if liveleak_url: -            return self.url_result(liveleak_url, 'LiveLeak') +        liveleak_urls = LiveLeakIE._extract_urls(webpage) +        if liveleak_urls: +            return self.playlist_from_matches(liveleak_urls, video_id, video_title)          # Look for 3Q SDN embeds          threeqsdn_url = ThreeQSDNIE._extract_url(webpage) @@ -2699,7 +2799,7 @@ class GenericIE(InfoExtractor):          rutube_urls = RutubeIE._extract_urls(webpage)          if rutube_urls:              return self.playlist_from_matches( -                rutube_urls, ie=RutubeIE.ie_key()) +                rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())          # Look for WashingtonPost embeds          wapo_urls = WashingtonPostIE._extract_urls(webpage) @@ -2707,18 +2807,44 @@ class GenericIE(InfoExtractor):              return self.playlist_from_matches(                  wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) -        # Looking for http://schema.org/VideoObject -        json_ld = self._search_json_ld( -            webpage, video_id, default={}, expected_type='VideoObject') -        if json_ld.get('url'): -            info_dict.update({ -                'title': video_title or info_dict['title'], -                'description': video_description, -                'thumbnail': video_thumbnail, -                'age_limit': age_limit -            }) -            info_dict.update(json_ld) -            return info_dict +        # Look for Mediaset embeds +        mediaset_urls = MediasetIE._extract_urls(webpage) +        if mediaset_urls: +            return self.playlist_from_matches( +                mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) + +        # Look for JOJ.sk embeds +        joj_urls = JojIE._extract_urls(webpage) +        if joj_urls: +            return self.playlist_from_matches( +                joj_urls, video_id, video_title, ie=JojIE.ie_key()) + +        # Look for megaphone.fm embeds +        mpfn_urls = MegaphoneIE._extract_urls(webpage) +        if mpfn_urls: +            return self.playlist_from_matches( +                mpfn_urls, video_id, video_title, 
ie=MegaphoneIE.ie_key()) + +        # Look for vzaar embeds +        vzaar_urls = VzaarIE._extract_urls(webpage) +        if vzaar_urls: +            return self.playlist_from_matches( +                vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) + +        def merge_dicts(dict1, dict2): +            merged = {} +            for k, v in dict1.items(): +                if v is not None: +                    merged[k] = v +            for k, v in dict2.items(): +                if v is None: +                    continue +                if (k not in merged or +                        (isinstance(v, compat_str) and v and +                            isinstance(merged[k], compat_str) and +                            not merged[k])): +                    merged[k] = v +            return merged          # Look for HTML5 media          entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') @@ -2736,9 +2862,13 @@ class GenericIE(InfoExtractor):          if jwplayer_data:              info = self._parse_jwplayer_data(                  jwplayer_data, video_id, require_title=False, base_url=url) -            if not info.get('title'): -                info['title'] = video_title -            return info +            return merge_dicts(info, info_dict) + +        # Looking for http://schema.org/VideoObject +        json_ld = self._search_json_ld( +            webpage, video_id, default={}, expected_type='VideoObject') +        if json_ld.get('url'): +            return merge_dicts(json_ld, info_dict)          def check_video(vurl):              if YoutubeIE.suitable(vurl): diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py index 884700c52..45ccc11c1 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/youtube_dl/extractor/gfycat.py @@ -82,7 +82,7 @@ class GfycatIE(InfoExtractor):              video_url = gfy.get('%sUrl' % format_id)              if not video_url:                  continue -            filesize = gfy.get('%sSize' % format_id) +            filesize = int_or_none(gfy.get('%sSize' % format_id))              formats.append({                  'url': video_url,                  'format_id': format_id, diff --git a/youtube_dl/extractor/giantbomb.py b/youtube_dl/extractor/giantbomb.py index 29b684d35..6a1b1e96e 100644 --- a/youtube_dl/extractor/giantbomb.py +++ b/youtube_dl/extractor/giantbomb.py @@ -5,9 +5,10 @@ import json  from .common import InfoExtractor  from ..utils import ( -    unescapeHTML, -    qualities, +    determine_ext,      int_or_none, +    qualities, +    unescapeHTML,  ) @@ -15,7 +16,7 @@ class GiantBombIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/videos/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)'      _TEST = {          'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/', -        'md5': '57badeface303ecf6b98b812de1b9018', +        'md5': 'c8ea694254a59246a42831155dec57ac',          'info_dict': {              'id': '2300-9782',              'display_id': 'quick-look-destiny-the-dark-below', @@ -51,11 +52,16 @@ class GiantBombIE(InfoExtractor):          for format_id, video_url in video['videoStreams'].items():              if format_id == 'f4m_stream':                  continue -            if video_url.endswith('.f4m'): +            ext = determine_ext(video_url) +            if ext == 'f4m':                  f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id)                  if f4m_formats:                      
f4m_formats[0]['quality'] = quality(format_id)                      formats.extend(f4m_formats) +            elif ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    video_url, display_id, ext='mp4', entry_protocol='m3u8_native', +                    m3u8_id='hls', fatal=False))              else:                  formats.append({                      'url': video_url, diff --git a/youtube_dl/extractor/godtv.py b/youtube_dl/extractor/godtv.py deleted file mode 100644 index c5d3b4e6a..000000000 --- a/youtube_dl/extractor/godtv.py +++ /dev/null @@ -1,66 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .ooyala import OoyalaIE -from ..utils import js_to_json - - -class GodTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?god\.tv(?:/[^/]+)*/(?P<id>[^/?#&]+)' -    _TESTS = [{ -        'url': 'http://god.tv/jesus-image/video/jesus-conference-2016/randy-needham', -        'info_dict': { -            'id': 'lpd3g2MzE6D1g8zFAKz8AGpxWcpu6o_3', -            'ext': 'mp4', -            'title': 'Randy Needham', -            'duration': 3615.08, -        }, -        'params': { -            'skip_download': True, -        } -    }, { -        'url': 'http://god.tv/playlist/bible-study', -        'info_dict': { -            'id': 'bible-study', -        }, -        'playlist_mincount': 37, -    }, { -        'url': 'http://god.tv/node/15097', -        'only_matching': True, -    }, { -        'url': 'http://god.tv/live/africa', -        'only_matching': True, -    }, { -        'url': 'http://god.tv/liveevents', -        'only_matching': True, -    }] - -    def _real_extract(self, url): -        display_id = self._match_id(url) - -        webpage = self._download_webpage(url, display_id) - -        settings = self._parse_json( -            self._search_regex( -                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', -                webpage, 'settings', default='{}'), -            display_id, transform_source=js_to_json, fatal=False) - -        ooyala_id = None - -        if settings: -            playlist = settings.get('playlist') -            if playlist and isinstance(playlist, list): -                entries = [ -                    OoyalaIE._build_url_result(video['content_id']) -                    for video in playlist if video.get('content_id')] -                if entries: -                    return self.playlist_result(entries, display_id) -            ooyala_id = settings.get('ooyala', {}).get('content_id') - -        if not ooyala_id: -            ooyala_id = self._search_regex( -                r'["\']content_id["\']\s*:\s*(["\'])(?P<id>[\w-]+)\1', -                webpage, 'ooyala id', group='id') - -        return OoyalaIE._build_url_result(ooyala_id) diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py index 2bfb99040..47a068e74 100644 --- a/youtube_dl/extractor/golem.py +++ b/youtube_dl/extractor/golem.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..compat import ( +    compat_str,      compat_urlparse,  )  from ..utils import ( @@ -46,7 +47,7 @@ class GolemIE(InfoExtractor):                  continue              formats.append({ -                'format_id': e.tag, +                'format_id': compat_str(e.tag),                  'url': compat_urlparse.urljoin(self._PREFIX, url),                  'height': self._int(e.get('height'), 'height'),                  'width': self._int(e.get('width'), 
'width'), diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index fec36cbbb..3bf462d63 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,17 +4,30 @@ import re  from .common import InfoExtractor  from ..utils import ( +    determine_ext,      ExtractorError,      int_or_none,      lowercase_escape, +    update_url_query,  )  class GoogleDriveIE(InfoExtractor): -    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' +    _VALID_URL = r'''(?x) +                        https?:// +                            (?: +                                (?:docs|drive)\.google\.com/ +                                (?: +                                    (?:uc|open)\?.*?id=| +                                    file/d/ +                                )| +                                video\.google\.com/get_player\?.*?docid= +                            ) +                            (?P<id>[a-zA-Z0-9_-]{28,}) +                    '''      _TESTS = [{          'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', -        'md5': 'd109872761f7e7ecf353fa108c0dbe1e', +        'md5': '5c602afbbf2c1db91831f5d82f678554',          'info_dict': {              'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',              'ext': 'mp4', @@ -22,8 +35,30 @@ class GoogleDriveIE(InfoExtractor):              'duration': 45,          }      }, { +        # video can't be watched anonymously due to view count limit reached, +        # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046) +        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', +        'md5': 'bfbd670d03a470bb1e6d4a257adec12e', +        'info_dict': { +            'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', +            'ext': 'mp4', +            'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', +        } +    }, {          # video id is longer than 28 characters          'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', +        'info_dict': { +            'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', +            'ext': 'mp4', +            'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', +            'duration': 189, +        }, +        'only_matching': True, +    }, { +        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', +        'only_matching': True, +    }, { +        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',          'only_matching': True,      }]      _FORMATS_EXT = { @@ -44,6 +79,13 @@ class GoogleDriveIE(InfoExtractor):          '46': 'webm',          '59': 'mp4',      } +    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' +    _CAPTIONS_ENTRY_TAG = { +        'subtitles': 'track', +        'automatic_captions': 'target', +    } +    _caption_formats_ext = [] +    _captions_xml = None      @staticmethod      def _extract_url(webpage): @@ -53,41 +95,183 @@ class GoogleDriveIE(InfoExtractor):          if mobj:              return 'https://drive.google.com/file/d/%s' % mobj.group('id') +    def _download_subtitles_xml(self, video_id, subtitles_id, hl): +        if self._captions_xml: +            return +        self._captions_xml = self._download_xml( +            self._BASE_URL_CAPTIONS, video_id, query={ +                'id': video_id, +                'vid': subtitles_id, +         
       'hl': hl, +                'v': video_id, +                'type': 'list', +                'tlangs': '1', +                'fmts': '1', +                'vssids': '1', +            }, note='Downloading subtitles XML', +            errnote='Unable to download subtitles XML', fatal=False) +        if self._captions_xml: +            for f in self._captions_xml.findall('format'): +                if f.attrib.get('fmt_code') and not f.attrib.get('default'): +                    self._caption_formats_ext.append(f.attrib['fmt_code']) + +    def _get_captions_by_type(self, video_id, subtitles_id, caption_type, +                              origin_lang_code=None): +        if not subtitles_id or not caption_type: +            return +        captions = {} +        for caption_entry in self._captions_xml.findall( +                self._CAPTIONS_ENTRY_TAG[caption_type]): +            caption_lang_code = caption_entry.attrib.get('lang_code') +            if not caption_lang_code: +                continue +            caption_format_data = [] +            for caption_format in self._caption_formats_ext: +                query = { +                    'vid': subtitles_id, +                    'v': video_id, +                    'fmt': caption_format, +                    'lang': (caption_lang_code if origin_lang_code is None +                             else origin_lang_code), +                    'type': 'track', +                    'name': '', +                    'kind': '', +                } +                if origin_lang_code is not None: +                    query.update({'tlang': caption_lang_code}) +                caption_format_data.append({ +                    'url': update_url_query(self._BASE_URL_CAPTIONS, query), +                    'ext': caption_format, +                }) +            captions[caption_lang_code] = caption_format_data +        return captions + +    def _get_subtitles(self, video_id, subtitles_id, hl): +        if not subtitles_id or not hl: +            return +        self._download_subtitles_xml(video_id, subtitles_id, hl) +        if not self._captions_xml: +            return +        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles') + +    def _get_automatic_captions(self, video_id, subtitles_id, hl): +        if not subtitles_id or not hl: +            return +        self._download_subtitles_xml(video_id, subtitles_id, hl) +        if not self._captions_xml: +            return +        track = self._captions_xml.find('track') +        if track is None: +            return +        origin_lang_code = track.attrib.get('lang_code') +        if not origin_lang_code: +            return +        return self._get_captions_by_type( +            video_id, subtitles_id, 'automatic_captions', origin_lang_code) +      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(              'http://docs.google.com/file/d/%s' % video_id, video_id) -        reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) -        if reason: -            raise ExtractorError(reason) - -        title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') +        title = self._search_regex( +            r'"title"\s*,\s*"([^"]+)', webpage, 'title', +            default=None) or self._og_search_title(webpage)          duration = int_or_none(self._search_regex( -            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) -      
  fmt_stream_map = self._search_regex( -            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') -        fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') +            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', +            default=None))          formats = [] -        for fmt, fmt_stream in zip(fmt_list, fmt_stream_map): -            fmt_id, fmt_url = fmt_stream.split('|') -            resolution = fmt.split('/')[1] -            width, height = resolution.split('x') -            formats.append({ -                'url': lowercase_escape(fmt_url), -                'format_id': fmt_id, -                'resolution': resolution, -                'width': int_or_none(width), -                'height': int_or_none(height), -                'ext': self._FORMATS_EXT[fmt_id], +        fmt_stream_map = self._search_regex( +            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, +            'fmt stream map', default='').split(',') +        fmt_list = self._search_regex( +            r'"fmt_list"\s*,\s*"([^"]+)', webpage, +            'fmt_list', default='').split(',') +        if fmt_stream_map and fmt_list: +            resolutions = {} +            for fmt in fmt_list: +                mobj = re.search( +                    r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt) +                if mobj: +                    resolutions[mobj.group('format_id')] = ( +                        int(mobj.group('width')), int(mobj.group('height'))) + +            for fmt_stream in fmt_stream_map: +                fmt_stream_split = fmt_stream.split('|') +                if len(fmt_stream_split) < 2: +                    continue +                format_id, format_url = fmt_stream_split[:2] +                f = { +                    'url': lowercase_escape(format_url), +                    'format_id': format_id, +                    'ext': self._FORMATS_EXT[format_id], +                } +                resolution = resolutions.get(format_id) +                if resolution: +                    f.update({ +                        'width': resolution[0], +                        'height': resolution[1], +                    }) +                formats.append(f) + +        source_url = update_url_query( +            'https://drive.google.com/uc', { +                'id': video_id, +                'export': 'download',              }) +        urlh = self._request_webpage( +            source_url, video_id, note='Requesting source file', +            errnote='Unable to request source file', fatal=False) +        if urlh: +            def add_source_format(src_url): +                formats.append({ +                    'url': src_url, +                    'ext': determine_ext(title, 'mp4').lower(), +                    'format_id': 'source', +                    'quality': 1, +                }) +            if urlh.headers.get('Content-Disposition'): +                add_source_format(source_url) +            else: +                confirmation_webpage = self._webpage_read_content( +                    urlh, url, video_id, note='Downloading confirmation page', +                    errnote='Unable to confirm download', fatal=False) +                if confirmation_webpage: +                    confirm = self._search_regex( +                        r'confirm=([^&"\']+)', confirmation_webpage, +                        'confirmation code', fatal=False) +                    if confirm: +                       
 add_source_format(update_url_query(source_url, { +                            'confirm': confirm, +                        })) + +        if not formats: +            reason = self._search_regex( +                r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) +            if reason: +                raise ExtractorError(reason, expected=True) +          self._sort_formats(formats) +        hl = self._search_regex( +            r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) +        subtitles_id = None +        ttsurl = self._search_regex( +            r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) +        if ttsurl: +            # the video Id for subtitles will be the last value in the ttsurl +            # query string +            subtitles_id = ttsurl.encode('utf-8').decode( +                'unicode_escape').split('=')[-1] +          return {              'id': video_id,              'title': title,              'thumbnail': self._og_search_thumbnail(webpage, default=None),              'duration': duration,              'formats': formats, +            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), +            'automatic_captions': self.extract_automatic_captions( +                video_id, subtitles_id, hl),          } diff --git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py index e854300c7..a4f332565 100644 --- a/youtube_dl/extractor/hgtv.py +++ b/youtube_dl/extractor/hgtv.py @@ -7,14 +7,19 @@ from .common import InfoExtractor  class HGTVComShowIE(InfoExtractor):      IE_NAME = 'hgtv.com:show'      _VALID_URL = r'https?://(?:www\.)?hgtv\.com/shows/[^/]+/(?P<id>[^/?#&]+)' -    _TEST = { -        'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-videos', +    _TESTS = [{ +        # data-module="video" +        'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-season-4-videos',          'info_dict': { -            'id': 'flip-or-flop-full-episodes-videos', +            'id': 'flip-or-flop-full-episodes-season-4-videos',              'title': 'Flip or Flop Full Episodes',          },          'playlist_mincount': 15, -    } +    }, { +        # data-deferred-module="video" +        'url': 'http://www.hgtv.com/shows/good-bones/episodes/an-old-victorian-house-gets-a-new-facelift', +        'only_matching': True, +    }]      def _real_extract(self, url):          display_id = self._match_id(url) @@ -23,7 +28,7 @@ class HGTVComShowIE(InfoExtractor):          config = self._parse_json(              self._search_regex( -                r'(?s)data-module=["\']video["\'][^>]*>.*?<script[^>]+type=["\']text/x-config["\'][^>]*>(.+?)</script', +                r'(?s)data-(?:deferred-)?module=["\']video["\'][^>]*>.*?<script[^>]+type=["\']text/x-config["\'][^>]*>(.+?)</script',                  webpage, 'video config'),              display_id)['channels'][0] diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py index e21ebb8fb..1d905dc81 100644 --- a/youtube_dl/extractor/hitbox.py +++ b/youtube_dl/extractor/hitbox.py @@ -16,8 +16,8 @@ from ..utils import (  class HitboxIE(InfoExtractor):      IE_NAME = 'hitbox' -    _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?:[^/]+/)*videos?/(?P<id>[0-9]+)' +    _TESTS = [{          'url': 'http://www.hitbox.tv/video/203213',          'info_dict': {              'id': '203213', @@ -38,13 +38,15 @@ class 
HitboxIE(InfoExtractor):              # m3u8 download              'skip_download': True,          }, -    } +    }, { +        'url': 'https://www.smashcast.tv/hitboxlive/videos/203213', +        'only_matching': True, +    }]      def _extract_metadata(self, url, video_id):          thumb_base = 'https://edge.sf.hitbox.tv'          metadata = self._download_json( -            '%s/%s' % (url, video_id), video_id, -            'Downloading metadata JSON') +            '%s/%s' % (url, video_id), video_id, 'Downloading metadata JSON')          date = 'media_live_since'          media_type = 'livestream' @@ -63,14 +65,15 @@ class HitboxIE(InfoExtractor):          views = int_or_none(video_meta.get('media_views'))          timestamp = parse_iso8601(video_meta.get(date), ' ')          categories = [video_meta.get('category_name')] -        thumbs = [ -            {'url': thumb_base + video_meta.get('media_thumbnail'), -             'width': 320, -             'height': 180}, -            {'url': thumb_base + video_meta.get('media_thumbnail_large'), -             'width': 768, -             'height': 432}, -        ] +        thumbs = [{ +            'url': thumb_base + video_meta.get('media_thumbnail'), +            'width': 320, +            'height': 180 +        }, { +            'url': thumb_base + video_meta.get('media_thumbnail_large'), +            'width': 768, +            'height': 432 +        }]          return {              'id': video_id, @@ -90,7 +93,7 @@ class HitboxIE(InfoExtractor):          video_id = self._match_id(url)          player_config = self._download_json( -            'https://www.hitbox.tv/api/player/config/video/%s' % video_id, +            'https://www.smashcast.tv/api/player/config/video/%s' % video_id,              video_id, 'Downloading video JSON')          formats = [] @@ -121,8 +124,7 @@ class HitboxIE(InfoExtractor):          self._sort_formats(formats)          metadata = self._extract_metadata( -            'https://www.hitbox.tv/api/media/video', -            video_id) +            'https://www.smashcast.tv/api/media/video', video_id)          metadata['formats'] = formats          return metadata @@ -130,8 +132,8 @@ class HitboxIE(InfoExtractor):  class HitboxLiveIE(HitboxIE):      IE_NAME = 'hitbox:live' -    _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P<id>.+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)' +    _TESTS = [{          'url': 'http://www.hitbox.tv/dimak',          'info_dict': {              'id': 'dimak', @@ -146,13 +148,20 @@ class HitboxLiveIE(HitboxIE):              # live              'skip_download': True,          }, -    } +    }, { +        'url': 'https://www.smashcast.tv/dimak', +        'only_matching': True, +    }] + +    @classmethod +    def suitable(cls, url): +        return False if HitboxIE.suitable(url) else super(HitboxLiveIE, cls).suitable(url)      def _real_extract(self, url):          video_id = self._match_id(url)          player_config = self._download_json( -            'https://www.hitbox.tv/api/player/config/live/%s' % video_id, +            'https://www.smashcast.tv/api/player/config/live/%s' % video_id,              video_id)          formats = [] @@ -197,8 +206,7 @@ class HitboxLiveIE(HitboxIE):          self._sort_formats(formats)          metadata = self._extract_metadata( -            'https://www.hitbox.tv/api/media/live', -            video_id) +            'https://www.smashcast.tv/api/media/live', video_id)          
metadata['formats'] = formats          metadata['is_live'] = True          metadata['title'] = self._live_title(metadata.get('title')) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index c45c68c1d..c1367cf51 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -89,6 +89,11 @@ class IGNIE(InfoExtractor):              'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',              'only_matching': True,          }, +        { +            # videoId pattern +            'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', +            'only_matching': True, +        },      ]      def _find_video_id(self, webpage): @@ -98,6 +103,8 @@ class IGNIE(InfoExtractor):              r'data-video-id="(.+?)"',              r'<object id="vid_(.+?)"',              r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', +            r'videoId"\s*:\s*"(.+?)"', +            r'videoId["\']\s*:\s*["\']([^"\']+?)["\']',          ]          return self._search_regex(res_id, webpage, 'video id', default=None) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index f95c00c73..3ff672a89 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -13,7 +13,7 @@ from ..utils import (  class ImdbIE(InfoExtractor):      IE_NAME = 'imdb'      IE_DESC = 'Internet Movie Database trailers' -    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)'      _TESTS = [{          'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -35,6 +35,9 @@ class ImdbIE(InfoExtractor):      }, {          'url': 'http://www.imdb.com/videoplayer/vi1562949145',          'only_matching': True, +    }, { +        'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index f3156804d..26c48e4b8 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -59,12 +59,18 @@ class ITVIE(InfoExtractor):          def _add_sub_element(element, name):              return etree.SubElement(element, _add_ns(name)) +        production_id = ( +            params.get('data-video-autoplay-id') or +            '%s#001' % ( +                params.get('data-video-episode-id') or +                video_id.replace('a', '/'))) +          req_env = etree.Element(_add_ns('soapenv:Envelope'))          _add_sub_element(req_env, 'soapenv:Header')          body = _add_sub_element(req_env, 'soapenv:Body')          get_playlist = _add_sub_element(body, ('tem:GetPlaylist'))          request = _add_sub_element(get_playlist, 'tem:request') -        _add_sub_element(request, 'itv:ProductionId').text = params['data-video-id'] +        _add_sub_element(request, 'itv:ProductionId').text = production_id          _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()          vodcrid = _add_sub_element(request, 'itv:Vodcrid')          _add_sub_element(vodcrid, 'com:Id') diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py new file mode 100755 index 000000000..a764023e9 --- /dev/null +++ b/youtube_dl/extractor/joj.py @@ -0,0 +1,100 @@ +# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    try_get,
+)
+
+
+class JojIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        joj:|
+                        https?://media\.joj\.sk/embed/
+                    )
+                    (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
+                '''
+    _TESTS = [{
+        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
+        'info_dict': {
+            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
+            'ext': 'mp4',
+            'title': 'NOVÉ BÝVANIE',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 3118,
+        }
+    }, {
+        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
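+        # Used by GenericIE (see generic.py above) to discover JOJ.sk
+        # <iframe> embeds on third-party pages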
+        return re.findall(
+            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
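+        # Both the joj:<id> shorthand and direct embed URLs resolve to the
+        # same embed page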
+        webpage = self._download_webpage(
+            'https://media.joj.sk/embed/%s' % video_id, video_id)
+
+        title = self._search_regex(
+            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'<title>(?P<title>[^<]+)'), webpage, 'title',
+            default=None, group='title') or self._og_search_title(webpage)
+
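+        # The embed page inlines a JS object, e.g. bitrates = {mp4: [...]};
+        # js_to_json turns it into strict JSON before parsing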
+        bitrates = self._parse_json(
+            self._search_regex(
+                r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates',
+                default='{}'),
+            video_id, transform_source=js_to_json, fatal=False)
+
+        formats = []
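+        # Progressive MP4 URLs carry the vertical resolution in the
+        # filename, e.g. ...720p.mp4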
+        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
+            if isinstance(format_url, compat_str):
+                height = self._search_regex(
+                    r'(\d+)[pP]\.', format_url, 'height', default=None)
+                formats.append({
+                    'url': format_url,
+                    'format_id': '%sp' % height if height else None,
+                    'height': int_or_none(height),
+                })
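+        # Fall back to the XML playlist service when no bitrates object
+        # is present on the embed page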
+        if not formats:
+            playlist = self._download_xml(
+                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
+                video_id)
+            for file_el in playlist.findall('./files/file'):
+                path = file_el.get('path')
+                if not path:
+                    continue
+                format_id = file_el.get('id') or file_el.get('label')
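+                # The playlist 'path' carries a 'dat/' prefix that is
+                # stripped when building the public storage URL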
+                formats.append({
+                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
+                        'dat/', '', 1),
+                    'format_id': format_id,
+                    'height': int_or_none(self._search_regex(
+                        r'(\d+)[pP]', format_id or path, 'height',
+                        default=None)),
+                })
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        duration = int_or_none(self._search_regex(
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
 diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py index f9a034b78..27e0e37f6 100644 --- a/youtube_dl/extractor/jove.py +++ b/youtube_dl/extractor/jove.py @@ -65,9 +65,9 @@ class JoveIE(InfoExtractor):              webpage, 'description', fatal=False)          publish_date = unified_strdate(self._html_search_meta(              'citation_publication_date', webpage, 'publish date', fatal=False)) -        comment_count = self._html_search_regex( +        comment_count = int(self._html_search_regex(              r'<meta name="num_comments" content="(\d+) Comments?"', -            webpage, 'comment count', fatal=False) +            webpage, 'comment count', fatal=False))          return {              'id': video_id, diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 41c1f3d96..138d4844d 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -324,7 +324,7 @@ class KalturaIE(InfoExtractor):          if captions:              for caption in captions.get('objects', []):                  # Continue if caption is not ready -                if f.get('status') != 2: +                if caption.get('status') != 2:                      continue                  if not caption.get('id'):                      continue diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index 4e9eb67bf..f236a2f78 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -48,7 +48,7 @@ class KarriereVideosIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          title = (self._html_search_meta('title', webpage, default=None) or -                 self._search_regex(r'<h1 class="title">([^<]+)</h1>')) +                 self._search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'video title'))          video_id = self._search_regex(              r'/config/video/(.+?)\.xml', webpage, 'video id') diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 3190b187c..c7f813370 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -1,6 +1,8 @@  # coding: utf-8  from __future__ import unicode_literals +import json +  from .common import InfoExtractor  from ..utils import (      ExtractorError, @@ -8,15 +10,15 @@ from ..utils import (      urlencode_postdata,      xpath_element,      xpath_text, -    urljoin,      update_url_query, +    js_to_json,  )  class Laola1TvEmbedIE(InfoExtractor):      IE_NAME = 'laola1tv:embed'      _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          # flashvars.premium = "false";          'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024',          'info_dict': { @@ -26,7 +28,30 @@ class Laola1TvEmbedIE(InfoExtractor):              'uploader': 'ITTF - International Table Tennis Federation',              'upload_date': '20161211',          }, -    } +    }] + +    def _extract_token_url(self, stream_access_url, video_id, data): +        return self._download_json( +            stream_access_url, video_id, headers={ +                'Content-Type': 'application/json', +            }, data=json.dumps(data).encode())['data']['stream-access'][0] + +    def _extract_formats(self, token_url, video_id): +        token_doc = self._download_xml( +            token_url, video_id, 'Downloading token', +            
headers=self.geo_verification_headers()) + +        token_attrib = xpath_element(token_doc, './/token').attrib + +        if token_attrib['status'] != '0': +            raise ExtractorError( +                'Token error: %s' % token_attrib['comment'], expected=True) + +        formats = self._extract_akamai_formats( +            '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), +            video_id) +        self._sort_formats(formats) +        return formats      def _real_extract(self, url):          video_id = self._match_id(url) @@ -68,29 +93,16 @@ class Laola1TvEmbedIE(InfoExtractor):          else:              data_abo = urlencode_postdata(                  dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(',')))) -            token_url = self._download_json( -                'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', -                video_id, query={ +            stream_access_url = update_url_query( +                'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', {                      'videoId': _v('id'),                      'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'),                      'label': _v('label'),                      'area': _v('area'), -                }, data=data_abo)['data']['stream-access'][0] - -        token_doc = self._download_xml( -            token_url, video_id, 'Downloading token', -            headers=self.geo_verification_headers()) - -        token_attrib = xpath_element(token_doc, './/token').attrib +                }) +            token_url = self._extract_token_url(stream_access_url, video_id, data_abo) -        if token_attrib['status'] != '0': -            raise ExtractorError( -                'Token error: %s' % token_attrib['comment'], expected=True) - -        formats = self._extract_akamai_formats( -            '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), -            video_id) -        self._sort_formats(formats) +        formats = self._extract_formats(token_url, video_id)          categories_str = _v('meta_sports')          categories = categories_str.split(',') if categories_str else [] @@ -107,7 +119,7 @@ class Laola1TvEmbedIE(InfoExtractor):          } -class Laola1TvIE(InfoExtractor): +class Laola1TvIE(Laola1TvEmbedIE):      IE_NAME = 'laola1tv'      _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)'      _TESTS = [{ @@ -164,13 +176,60 @@ class Laola1TvIE(InfoExtractor):          if 'Dieser Livestream ist bereits beendet.' 
in webpage:              raise ExtractorError('This live stream has already finished.', expected=True) -        iframe_url = urljoin(url, self._search_regex( -            r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"', -            webpage, 'iframe url')) +        conf = self._parse_json(self._search_regex( +            r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), +            display_id, js_to_json) + +        video_id = conf['videoid'] + +        config = self._download_json(conf['configUrl'], video_id, query={ +            'videoid': video_id, +            'partnerid': conf['partnerid'], +            'language': conf.get('language', ''), +            'portal': conf.get('portalid', ''), +        }) +        error = config.get('error') +        if error: +            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + +        video_data = config['video'] +        title = video_data['title'] +        is_live = video_data.get('isLivestream') and video_data.get('isLive') +        meta = video_data.get('metaInformation') +        sports = meta.get('sports') +        categories = sports.split(',') if sports else [] + +        token_url = self._extract_token_url( +            video_data['streamAccess'], video_id, +            video_data['abo']['required']) + +        formats = self._extract_formats(token_url, video_id)          return { -            '_type': 'url', +            'id': video_id,              'display_id': display_id, -            'url': iframe_url, -            'ie_key': 'Laola1TvEmbed', +            'title': self._live_title(title) if is_live else title, +            'description': video_data.get('description'), +            'thumbnail': video_data.get('image'), +            'categories': categories, +            'formats': formats, +            'is_live': is_live,          } + + +class ITTFIE(InfoExtractor): +    _VALID_URL = r'https?://tv\.ittf\.com/video/[^/]+/(?P<id>\d+)' +    _TEST = { +        'url': 'https://tv.ittf.com/video/peng-wang-wei-matsudaira-kenta/951802', +        'only_matching': True, +    } + +    def _real_extract(self, url): +        return self.url_result( +            update_url_query('https://www.laola1.tv/titanplayer.php', { +                'videoid': self._match_id(url), +                'type': 'V', +                'lang': 'en', +                'portal': 'int', +                'customer': 1024, +            }), Laola1TvEmbedIE.ie_key()) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 0a5a3956c..ad65b2759 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -26,14 +26,16 @@ class LimelightBaseIE(InfoExtractor):              'Channel': 'channel',              'ChannelList': 'channel_list',          } + +        def smuggle(url): +            return smuggle_url(url, {'source_url': source_url}) +          entries = []          for kind, video_id in re.findall(                  r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})',                  webpage):              entries.append(cls.url_result( -                smuggle_url( -                    'limelight:%s:%s' % (lm[kind], video_id), -                    {'source_url': source_url}), +                smuggle('limelight:%s:%s' % (lm[kind], video_id)),                  'Limelight%s' % kind, video_id))          for mobj in re.finditer(                  # As per [1] class attribute should be exactly equal to @@ -49,10 +51,15 @@ class 
LimelightBaseIE(InfoExtractor):                  ''', webpage):              kind, video_id = mobj.group('kind'), mobj.group('id')              entries.append(cls.url_result( -                smuggle_url( -                    'limelight:%s:%s' % (kind, video_id), -                    {'source_url': source_url}), +                smuggle('limelight:%s:%s' % (kind, video_id)),                  'Limelight%s' % kind.capitalize(), video_id)) +        # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page) +        for video_id in re.findall( +                r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})', +                webpage): +            entries.append(cls.url_result( +                smuggle('limelight:media:%s' % video_id), +                LimelightMediaIE.ie_key(), video_id))          return entries      def _call_playlist_service(self, item_id, method, fatal=True, referer=None): diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index c7de65353..246aac576 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -1,6 +1,5 @@  from __future__ import unicode_literals -import json  import re  from .common import InfoExtractor @@ -11,10 +10,10 @@ class LiveLeakIE(InfoExtractor):      _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)'      _TESTS = [{          'url': 'http://www.liveleak.com/view?i=757_1364311680', -        'md5': '50f79e05ba149149c1b4ea961223d5b3', +        'md5': '0813c2430bea7a46bf13acf3406992f4',          'info_dict': {              'id': '757_1364311680', -            'ext': 'flv', +            'ext': 'mp4',              'description': 'extremely bad day for this guy..!',              'uploader': 'ljfriel2',              'title': 'Most unlucky car accident', @@ -22,7 +21,7 @@ class LiveLeakIE(InfoExtractor):          }      }, {          'url': 'http://www.liveleak.com/view?i=f93_1390833151', -        'md5': 'b13a29626183c9d33944e6a04f41aafc', +        'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',          'info_dict': {              'id': 'f93_1390833151',              'ext': 'mp4', @@ -32,6 +31,7 @@ class LiveLeakIE(InfoExtractor):              'thumbnail': r're:^https?://.*\.jpg$'          }      }, { +        # Prochan embed          'url': 'http://www.liveleak.com/view?i=4f7_1392687779',          'md5': '42c6d97d54f1db107958760788c5f48f',          'info_dict': { @@ -41,11 +41,13 @@ class LiveLeakIE(InfoExtractor):              'uploader': 'CapObveus',              'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',              'age_limit': 18, -        } +        }, +        'skip': 'Video is dead',      }, {          # Covers https://github.com/rg3/youtube-dl/pull/5983 +        # Multiple resolutions          'url': 'http://www.liveleak.com/view?i=801_1409392012', -        'md5': '0b3bec2d888c20728ca2ad3642f0ef15', +        'md5': 'c3a449dbaca5c0d1825caecd52a57d7b',          'info_dict': {              'id': '801_1409392012',              'ext': 'mp4', @@ -70,15 +72,20 @@ class LiveLeakIE(InfoExtractor):          'params': {              'skip_download': True,          }, +    }, { +        'url': 'https://www.liveleak.com/view?i=677_1439397581', +        'info_dict': { +            'id': '677_1439397581', +            'title': 'Fuel Depot in China Explosion caught on video', +        }, +        
'playlist_count': 3,      }]      @staticmethod -    def _extract_url(webpage): -        mobj = re.search( -            r'<iframe[^>]+src="https?://(?:\w+\.)?liveleak\.com/ll_embed\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)', +    def _extract_urls(webpage): +        return re.findall( +            r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"',              webpage) -        if mobj: -            return 'http://www.liveleak.com/view?i=%s' % mobj.group('id')      def _real_extract(self, url):          video_id = self._match_id(url) @@ -93,57 +100,70 @@ class LiveLeakIE(InfoExtractor):              webpage, 'age limit', default=None))          video_thumbnail = self._og_search_thumbnail(webpage) -        sources_raw = self._search_regex( -            r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) -        if sources_raw is None: -            alt_source = self._search_regex( -                r'(file: ".*?"),', webpage, 'video URL', default=None) -            if alt_source: -                sources_raw = '[{ %s}]' % alt_source +        entries = self._parse_html5_media_entries(url, webpage, video_id) +        if not entries: +            # Maybe an embed? +            embed_url = self._search_regex( +                r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', +                webpage, 'embed URL') +            return { +                '_type': 'url_transparent', +                'url': embed_url, +                'id': video_id, +                'title': video_title, +                'description': video_description, +                'uploader': video_uploader, +                'age_limit': age_limit, +            } + +        for idx, info_dict in enumerate(entries): +            for a_format in info_dict['formats']: +                if not a_format.get('height'): +                    a_format['height'] = int_or_none(self._search_regex( +                        r'([0-9]+)p\.mp4', a_format['url'], 'height label', +                        default=None)) + +            self._sort_formats(info_dict['formats']) + +            # Don't append entry ID for one-video pages to keep backward compatibility +            if len(entries) > 1: +                info_dict['id'] = '%s_%s' % (video_id, idx + 1)              else: -                # Maybe an embed? 
-                embed_url = self._search_regex(
-                    r'<iframe[^>]+src="(https?://(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
-                    webpage, 'embed URL')
-                return {
-                    '_type': 'url_transparent',
-                    'url': embed_url,
-                    'id': video_id,
-                    'title': video_title,
-                    'description': video_description,
-                    'uploader': video_uploader,
-                    'age_limit': age_limit,
-                }
-
-        sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
-        sources = json.loads(sources_json)
-
-        formats = [{
-            'format_id': '%s' % i,
-            'format_note': s.get('label'),
-            'url': s['file'],
-        } for i, s in enumerate(sources)]
-
-        for i, s in enumerate(sources):
-            # Removing '.h264_*.mp4' gives the raw video, which is essentially
-            # the same video without the LiveLeak logo at the top (see
-            # https://github.com/rg3/youtube-dl/pull/4768)
-            orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
-            if s['file'] != orig_url:
-                formats.append({
-                    'format_id': 'original-%s' % i,
-                    'format_note': s.get('label'),
-                    'url': orig_url,
-                    'preference': 1,
-                })
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': video_title,
-            'description': video_description,
-            'uploader': video_uploader,
-            'formats': formats,
-            'age_limit': age_limit,
-            'thumbnail': video_thumbnail,
-        }
+                info_dict['id'] = video_id
+
+            info_dict.update({
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'age_limit': age_limit,
+                'thumbnail': video_thumbnail,
+            })
+
+        return self.playlist_result(entries, video_id, video_title)
+
+
+class LiveLeakEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[if])=(?P<id>[\w_]+)'
+
+    # See generic.py for actual test cases
+    _TESTS = [{
+        'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        kind, video_id = mobj.group('kind', 'id')
+
+        if kind == 'f':
+            webpage = self._download_webpage(url, video_id)
+            liveleak_url = self._search_regex(
+                r'logourl\s*:\s*(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL,
+                webpage, 'LiveLeak URL', group='url')
+        elif kind == 'i':
+            liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id
+
+        return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key())
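Editor's note: the rewritten LiveLeak extractor above replaces hand-rolled parsing of the page's sources: JavaScript array with _parse_html5_media_entries, and falls back to the resolution label embedded in the media URL when a format carries no explicit height. A minimal standalone sketch of that fallback, under the same '([0-9]+)p\.mp4' assumption the diff uses (height_from_url and the sample URL are illustrative, not part of the extractor):

    import re

    def height_from_url(format_url):
        # LiveLeak HTML5 sources do not always expose a height attribute, so
        # the extractor recovers it from the 'NNNp.mp4' suffix of the file
        # name; returns None when no such label is present.
        mobj = re.search(r'([0-9]+)p\.mp4', format_url)
        return int(mobj.group(1)) if mobj else None

    print(height_from_url('https://cdn.example.com/clip.h264_720p.mp4'))  # 720

Per the comment in the hunk above, multi-video pages get an _1, _2, ... suffix appended to the page ID per entry, while single-video pages keep the bare ID for backward compatibility.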
diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py
new file mode 100644
index 000000000..b94b3c2ab
--- /dev/null
+++ b/youtube_dl/extractor/manyvids.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class ManyVidsIE(InfoExtractor):
+    _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/',
+        'md5': '03f11bb21c52dd12a05be21a5c7dcc97',
+        'info_dict': {
+            'id': '133957',
+            'ext': 'mp4',
+            'title': 'everthing about me (Preview)',
+            'view_count': int,
+            'like_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'data-(?:video-filepath|meta-video)\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'video URL', group='url')
+
+        title = '%s (Preview)' % self._html_search_regex(
+            r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title')
+
+        like_count = int_or_none(self._search_regex(
+            r'data-likes=["\'](\d+)', webpage, 'like count', default=None))
+        view_count = int_or_none(self._html_search_regex(
+            r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage,
+            'view count', default=None))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'view_count': view_count,
+            'like_count': like_count,
+            'formats': [{
+                'url': video_url,
+            }],
+        }
diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py
index 6e067474b..4c32fbc2c 100644
--- a/youtube_dl/extractor/medialaan.py
+++ b/youtube_dl/extractor/medialaan.py
@@ -17,7 +17,7 @@ from ..utils import (
 
 class MedialaanIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                    https?://
-                        (?:www\.)?
+                        (?:www\.|nieuws\.)?
                        
(?:                              (?P<site_id>vtm|q2|vtmkzoom)\.be/                              (?: @@ -85,6 +85,22 @@ class MedialaanIE(InfoExtractor):          # clip          'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio',          'only_matching': True, +    }, { +        # http/s redirect +        'url': 'https://vtmkzoom.be/video?aid=45724', +        'info_dict': { +            'id': '257136373657000', +            'ext': 'mp4', +            'title': 'K3 Dansstudio Ushuaia afl.6', +        }, +        'params': { +            'skip_download': True, +        }, +        'skip': 'Requires account credentials', +    }, { +        # nieuws.vtm.be +        'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma', +        'only_matching': True,      }]      def _real_initialize(self): @@ -146,6 +162,8 @@ class MedialaanIE(InfoExtractor):                  video_id, transform_source=lambda s: '[%s]' % s, fatal=False)              if player:                  video = player[-1] +                if video['videoUrl'] in ('http', 'https'): +                    return self.url_result(video['url'], MedialaanIE.ie_key())                  info = {                      'id': video_id,                      'url': video['videoUrl'], diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py new file mode 100644 index 000000000..9760eafd5 --- /dev/null +++ b/youtube_dl/extractor/mediaset.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    determine_ext, +    parse_duration, +    try_get, +    unified_strdate, +) + + +class MediasetIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    (?: +                        mediaset:| +                        https?:// +                            (?:www\.)?video\.mediaset\.it/ +                            (?: +                                (?:video|on-demand)/(?:[^/]+/)+[^/]+_| +                                player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= +                            ) +                    )(?P<id>[0-9]+) +                    ''' +    _TESTS = [{ +        # full episode +        'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', +        'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', +        'info_dict': { +            'id': '661824', +            'ext': 'mp4', +            'title': 'Quarta puntata', +            'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', +            'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 1414, +            'creator': 'mediaset', +            'upload_date': '20161107', +            'series': 'Hello Goodbye', +            'categories': ['reality'], +        }, +        'expected_warnings': ['is not a supported codec'], +    }, { +        # clip +        'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', +        'only_matching': True, +    }, { +        # iframe simple +        'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', +        'only_matching': True, +    }, { +        # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) +        'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', +        'only_matching': True, +    }, { 
+        'url': 'mediaset:661824', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_urls(webpage): +        return [ +            mobj.group('url') +            for mobj in re.finditer( +                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1', +                webpage)] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        video_list = self._download_json( +            'http://cdnsel01.mediaset.net/GetCdn.aspx', +            video_id, 'Downloading video CDN JSON', query={ +                'streamid': video_id, +                'format': 'json', +            })['videoList'] + +        formats = [] +        for format_url in video_list: +            if '.ism' in format_url: +                formats.extend(self._extract_ism_formats( +                    format_url, video_id, ism_id='mss', fatal=False)) +            else: +                formats.append({ +                    'url': format_url, +                    'format_id': determine_ext(format_url), +                }) +        self._sort_formats(formats) + +        mediainfo = self._download_json( +            'http://plr.video.mediaset.it/html/metainfo.sjson', +            video_id, 'Downloading video info JSON', query={ +                'id': video_id, +            })['video'] + +        title = mediainfo['title'] + +        creator = try_get( +            mediainfo, lambda x: x['brand-info']['publisher'], compat_str) +        category = try_get( +            mediainfo, lambda x: x['brand-info']['category'], compat_str) +        categories = [category] if category else None + +        return { +            'id': video_id, +            'title': title, +            'description': mediainfo.get('short-description'), +            'thumbnail': mediainfo.get('thumbnail'), +            'duration': parse_duration(mediainfo.get('duration')), +            'creator': creator, +            'upload_date': unified_strdate(mediainfo.get('production-date')), +            'webpage_url': mediainfo.get('url'), +            'series': mediainfo.get('brand-value'), +            'categories': categories, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/megaphone.py b/youtube_dl/extractor/megaphone.py new file mode 100644 index 000000000..60e3caf0d --- /dev/null +++ b/youtube_dl/extractor/megaphone.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class MegaphoneIE(InfoExtractor): +    IE_NAME = 'megaphone.fm' +    IE_DESC = 'megaphone.fm embedded players' +    _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)' +    _TEST = { +        'url': 'https://player.megaphone.fm/GLT9749789991?"', +        'md5': '4816a0de523eb3e972dc0dda2c191f96', +        'info_dict': { +            'id': 'GLT9749789991', +            'ext': 'mp3', +            'title': '#97 What Kind Of Idiot Gets Phished?', +            'thumbnail': 're:^https://.*\.png.*$', +            'duration': 1776.26375, +            'author': 'Reply All', +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        title = self._og_search_property('audio:title', webpage) +        author = self._og_search_property('audio:artist', webpage) +        thumbnail = self._og_search_thumbnail(webpage) + + 
       episode_json = self._search_regex(r'(?s)var\s+episode\s*=\s*(\{.+?\});', webpage, 'episode JSON')
+        episode_data = self._parse_json(episode_json, video_id, js_to_json)
+        video_url = self._proto_relative_url(episode_data['mediaUrl'], 'https:')
+
+        formats = [{
+            'url': video_url,
+        }]
+
+        return {
+            'id': video_id,
+            'thumbnail': thumbnail,
+            'title': title,
+            'author': author,
+            'duration': episode_data['duration'],
+            'formats': formats,
+        }
+
+    @classmethod
+    def _extract_urls(cls, webpage):
+        return [m[0] for m in re.findall(
+            r'<iframe[^>]*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)]
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 28b743cca..964dc542c 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -136,11 +136,9 @@ class MiTeleIE(InfoExtractor):
             video_id, 'Downloading gigya script')
 
         # Get a appKey/uuid for getting the session key
-        appKey_var = self._search_regex(
-            r'value\s*\(\s*["\']appGridApplicationKey["\']\s*,\s*([0-9a-f]+)',
-            gigya_sc, 'appKey variable')
         appKey = self._search_regex(
-            r'var\s+%s\s*=\s*["\']([0-9a-f]+)' % appKey_var, gigya_sc, 'appKey')
+            r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)',
+            gigya_sc, 'appKey')
 
         session_json = self._download_json(
             'https://appgrid-api.cloud.accedo.tv/session',
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 0efbe660a..f6360cce6 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -9,6 +9,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_chr,
     compat_ord,
+    compat_str,
     compat_urllib_parse_unquote,
     compat_urlparse,
 )
@@ -53,16 +54,27 @@ class MixcloudIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
-    @staticmethod
-    def _decrypt_play_info(play_info):
-        KEY = 'pleasedontdownloadourmusictheartistswontgetpaid'
+    _keys = [
+        'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };',
+        'pleasedontdownloadourmusictheartistswontgetpaid',
+        'window.addEventListener = window.addEventListener || function() {};',
+        '(function() { return new Date().toLocaleDateString(); })()'
+    ]
+    _current_key = None
 
+    # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
+    def _decrypt_play_info(self, play_info, video_id):
         play_info = base64.b64decode(play_info.encode('ascii'))
-
-        return ''.join([
-            compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)]))
-            for idx, ch in enumerate(play_info)])
+        for num, key in enumerate(self._keys, start=1):
+            try:
+                return self._parse_json(
+                    ''.join([
+                        compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)]))
+                        for idx, ch in enumerate(play_info)]),
+                    video_id)
+            except ExtractorError:
+                if num == len(self._keys):
+                    raise
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
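Editor's note: _decrypt_play_info above is a repeating-key XOR over the base64-decoded m-play-info attribute, tried with each candidate in _keys until the output parses as JSON. A minimal Python 3 sketch of the cipher step in isolation (the extractor itself goes through compat_chr/compat_ord to stay Python 2 compatible; xor_decrypt is an illustrative name):

    import base64

    def xor_decrypt(play_info_b64, key):
        # Byte i of the decoded payload is XORed with byte (i % len(key)) of
        # the key; with the correct key the result is the play-info JSON.
        data = base64.b64decode(play_info_b64)
        return ''.join(
            chr(ch ^ ord(key[idx % len(key)]))
            for idx, ch in enumerate(data))

The hunk below complements this by scraping fresh key candidates out of the site's player JavaScript at runtime and prepending them to _keys.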
@@ -72,14 +84,30 @@ class MixcloudIE(InfoExtractor):
 
         webpage = self._download_webpage(url, track_id)
 
+        if not self._current_key:
+            js_url = self._search_regex(
+                r'<script[^>]+\bsrc=["\'](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)',
+                webpage, 'js url', default=None)
+            if js_url:
+                js = self._download_webpage(js_url, track_id, fatal=False)
+                if js:
+                    KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P<key>(?:(?!\1).)+)\1'
+                    for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'):
+                        key = self._search_regex(
+                            KEY_RE_TEMPLATE % key_name, js, 'key',
+                            default=None, group='key')
+                        if key and isinstance(key, compat_str):
+                            self._keys.insert(0, key)
+                            self._current_key = key
+
         message = self._html_search_regex(
             r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
             webpage, 'error message', default=None)
 
         encrypted_play_info = self._search_regex(
             r'm-play-info="([^"]+)"', webpage, 'play info')
-        play_info = self._parse_json(
-            self._decrypt_play_info(encrypted_play_info), track_id)
+
+        play_info = self._decrypt_play_info(encrypted_play_info, track_id)
 
         if message and 'stream_url' not in play_info:
             raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index 59cd4b838..675ff6873 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -15,7 +15,7 @@ class MLBIE(InfoExtractor):
                         (?:[\da-z_-]+\.)*mlb\.com/
                         (?:
                             (?:
-                                (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|
+                                (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)|
                                 (?:
                                     shared/video/embed/(?:embed|m-internal-embed)\.html|
                                     (?:[^/]+/)+(?:play|index)\.jsp|
@@ -84,7 +84,7 @@ class MLBIE(InfoExtractor):
         },
         {
             'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
-            'md5': 'b190e70141fb9a1552a85426b4da1b5d',
+            'md5': 'aafaf5b0186fee8f32f20508092f8111',
             'info_dict': {
                 'id': '75609783',
                 'ext': 'mp4',
@@ -95,6 +95,10 @@ class MLBIE(InfoExtractor):
             }
         },
         {
+            'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694',
+            'only_matching': True,
+        },
+        {
             'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
             'only_matching': True,
         },
diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py
deleted file mode 100644
index 5a1bee5c8..000000000
--- a/youtube_dl/extractor/mpora.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class MporaIE(InfoExtractor):
-    _VALID_URL =
r'https?://(?:www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)' -    IE_NAME = 'MPORA' - -    _TEST = { -        'url': 'http://mpora.de/videos/AAdo8okx4wiz/embed?locale=de', -        'md5': 'a7a228473eedd3be741397cf452932eb', -        'info_dict': { -            'id': 'AAdo8okx4wiz', -            'ext': 'mp4', -            'title': 'Katy Curd -  Winter in the Forest', -            'duration': 416, -            'uploader': 'Peter Newman Media', -        }, -    } - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) - -        data_json = self._search_regex( -            [r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;", -             r"new\s+FM\.Kaltura\.Player\('[^']+'\s*,\s*({.+?})\);"], -            webpage, 'json') -        data = self._parse_json(data_json, video_id) - -        uploader = data['info_overlay'].get('username') -        duration = data['video']['duration'] // 1000 -        thumbnail = data['video']['encodings']['sd']['poster'] -        title = data['info_overlay']['title'] - -        formats = [] -        for encoding_id, edata in data['video']['encodings'].items(): -            for src in edata['sources']: -                width_str = self._search_regex( -                    r'_([0-9]+)\.[a-zA-Z0-9]+$', src['src'], -                    False, default=None) -                vcodec = src['type'].partition('/')[2] - -                formats.append({ -                    'format_id': encoding_id + '-' + vcodec, -                    'url': src['src'], -                    'vcodec': vcodec, -                    'width': int_or_none(width_str), -                }) - -        self._sort_formats(formats) - -        return { -            'id': video_id, -            'title': title, -            'formats': formats, -            'uploader': uploader, -            'duration': duration, -            'thumbnail': thumbnail, -        } diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py index 1473bcf48..650731fdc 100644 --- a/youtube_dl/extractor/msn.py +++ b/youtube_dl/extractor/msn.py @@ -68,10 +68,6 @@ class MSNIE(InfoExtractor):              format_url = file_.get('url')              if not format_url:                  continue -            ext = determine_ext(format_url) -            if ext == 'ism': -                formats.extend(self._extract_ism_formats( -                    format_url + '/Manifest', display_id, 'mss', fatal=False))              if 'm3u8' in format_url:                  # m3u8_native should not be used here until                  # https://github.com/rg3/youtube-dl/issues/9913 is fixed @@ -79,6 +75,9 @@ class MSNIE(InfoExtractor):                      format_url, display_id, 'mp4',                      m3u8_id='hls', fatal=False)                  formats.extend(m3u8_formats) +            elif determine_ext(format_url) == 'ism': +                formats.extend(self._extract_ism_formats( +                    format_url + '/Manifest', display_id, 'mss', fatal=False))              else:                  formats.append({                      'url': format_url, diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 8acea1461..25af5ddfd 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -50,8 +50,7 @@ class MTVServicesInfoExtractor(InfoExtractor):          thumb_node = itemdoc.find(search_path)          if thumb_node is None:              return None -        else: -            return thumb_node.attrib['url'] +        
return thumb_node.get('url') or thumb_node.text or None      def _extract_mobile_video_formats(self, mtvn_id):          webpage_url = self._MOBILE_TEMPLATE % mtvn_id @@ -83,7 +82,7 @@ class MTVServicesInfoExtractor(InfoExtractor):                  hls_url = rendition.find('./src').text                  formats.extend(self._extract_m3u8_formats(                      hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', -                    m3u8_id='hls')) +                    m3u8_id='hls', fatal=False))              else:                  # fms                  try: @@ -106,7 +105,8 @@ class MTVServicesInfoExtractor(InfoExtractor):                      }])                  except (KeyError, TypeError):                      raise ExtractorError('Invalid rendition field.') -        self._sort_formats(formats) +        if formats: +            self._sort_formats(formats)          return formats      def _extract_subtitles(self, mdoc, mtvn_id): @@ -133,8 +133,11 @@ class MTVServicesInfoExtractor(InfoExtractor):              mediagen_url += 'acceptMethods='              mediagen_url += 'hls' if use_hls else 'fms' -        mediagen_doc = self._download_xml(mediagen_url, video_id, -                                          'Downloading video urls') +        mediagen_doc = self._download_xml( +            mediagen_url, video_id, 'Downloading video urls', fatal=False) + +        if mediagen_doc is False: +            return None          item = mediagen_doc.find('./video/item')          if item is not None and item.get('type') == 'text': @@ -174,6 +177,13 @@ class MTVServicesInfoExtractor(InfoExtractor):          formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id) +        # Some parts of complete video may be missing (e.g. missing Act 3 in +        # http://www.southpark.de/alle-episoden/s14e01-sexual-healing) +        if not formats: +            return None + +        self._sort_formats(formats) +          return {              'title': title,              'formats': formats, @@ -205,9 +215,14 @@ class MTVServicesInfoExtractor(InfoExtractor):          title = xpath_text(idoc, './channel/title')          description = xpath_text(idoc, './channel/description') +        entries = [] +        for item in idoc.findall('.//item'): +            info = self._get_video_info(item, use_hls) +            if info: +                entries.append(info) +          return self.playlist_result( -            [self._get_video_info(item, use_hls) for item in idoc.findall('.//item')], -            playlist_title=title, playlist_description=description) +            entries, playlist_title=title, playlist_description=description)      def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):          triforce_feed = self._parse_json(self._search_regex( diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index f281238c9..e164d5940 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -12,64 +12,62 @@ from ..utils import (  class MySpaceIE(InfoExtractor): -    _VALID_URL = r'https?://myspace\.com/([^/]+)/(?P<mediatype>video/[^/]+/|music/song/.*?)(?P<id>\d+)' +    _VALID_URL = r'''(?x) +                    https?:// +                        myspace\.com/[^/]+/ +                        (?P<mediatype> +                            video/[^/]+/(?P<video_id>\d+)| +                            music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$) +                        ) +                    ''' -    _TESTS = [ -        { -     
       'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', -            'md5': '9c1483c106f4a695c47d2911feed50a7', -            'info_dict': { -                'id': '109594919', -                'ext': 'mp4', -                'title': 'Little Big Town', -                'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', -                'uploader': 'Five Minutes to the Stage', -                'uploader_id': 'fiveminutestothestage', -                'timestamp': 1414108751, -                'upload_date': '20141023', -            }, +    _TESTS = [{ +        'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', +        'md5': '9c1483c106f4a695c47d2911feed50a7', +        'info_dict': { +            'id': '109594919', +            'ext': 'mp4', +            'title': 'Little Big Town', +            'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', +            'uploader': 'Five Minutes to the Stage', +            'uploader_id': 'fiveminutestothestage', +            'timestamp': 1414108751, +            'upload_date': '20141023',          }, +    }, {          # songs -        { -            'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', -            'md5': '1d7ee4604a3da226dd69a123f748b262', -            'info_dict': { -                'id': '93388656', -                'ext': 'm4a', -                'title': 'Of weakened soul...', -                'uploader': 'Killsorrow', -                'uploader_id': 'killsorrow', -            }, -        }, { -            'add_ie': ['Youtube'], -            'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', -            'info_dict': { -                'id': 'xqds0B_meys', -                'ext': 'webm', -                'title': 'Three Days Grace - Animal I Have Become', -                'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', -                'uploader': 'ThreeDaysGraceVEVO', -                'uploader_id': 'ThreeDaysGraceVEVO', -                'upload_date': '20091002', -            }, -        }, { -            'add_ie': ['Youtube'], -            'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', -            'info_dict': { -                'id': 'ypWvQgnJrSU', -                'ext': 'mp4', -                'title': 'Starset - First Light', -                'description': 'md5:2d5db6c9d11d527683bcda818d332414', -                'uploader': 'Yumi K', -                'uploader_id': 'SorenPromotions', -                'upload_date': '20140725', -            } +        'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', +        'md5': '1d7ee4604a3da226dd69a123f748b262', +        'info_dict': { +            'id': '93388656', +            'ext': 'm4a', +            'title': 'Of weakened soul...', +            'uploader': 'Killsorrow', +            'uploader_id': 'killsorrow',          }, -    ] +    }, { +        'add_ie': ['Youtube'], +        'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', +        'info_dict': { +            'id': 'xqds0B_meys', +            'ext': 'webm', +            'title': 'Three Days Grace - Animal I Have Become', +            'description': 
'md5:8bd86b3693e72a077cf863a8530c54bb', +            'uploader': 'ThreeDaysGraceVEVO', +            'uploader_id': 'ThreeDaysGraceVEVO', +            'upload_date': '20091002', +        }, +    }, { +        'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', +        'only_matching': True, +    }, { +        'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388', +        'only_matching': True, +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = mobj.group('video_id') or mobj.group('song_id')          is_song = mobj.group('mediatype').startswith('music/song')          webpage = self._download_webpage(url, video_id)          player_url = self._search_regex( diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index d2a44d05d..62db70b43 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -5,10 +5,8 @@ import re  from .common import InfoExtractor  from .theplatform import ThePlatformIE  from .adobepass import AdobePassIE -from ..compat import compat_urllib_parse_urlparse  from ..utils import (      find_xpath_attr, -    lowercase_escape,      smuggle_url,      unescapeHTML,      update_url_query, @@ -17,7 +15,7 @@ from ..utils import (  class NBCIE(AdobePassIE): -    _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' +    _VALID_URL = r'(?P<permalink>https?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+))'      _TESTS = [          { @@ -37,16 +35,6 @@ class NBCIE(AdobePassIE):              },          },          { -            'url': 'http://www.nbc.com/the-tonight-show/episodes/176', -            'info_dict': { -                'id': '176', -                'ext': 'flv', -                'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen', -                'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.', -            }, -            'skip': '404 Not Found', -        }, -        {              'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',              'info_dict': {                  'id': '2832821', @@ -64,11 +52,6 @@ class NBCIE(AdobePassIE):              'skip': 'Only works from US',          },          { -            # This video has expired but with an escaped embedURL -            'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515', -            'only_matching': True, -        }, -        {              # HLS streams requires the 'hdnea3' cookie              'url': 'http://www.nbc.com/Kings/video/goliath/n1806',              'info_dict': { @@ -88,59 +71,38 @@ class NBCIE(AdobePassIE):      ]      def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) -        info = { +        permalink, video_id = re.match(self._VALID_URL, url).groups() +        video_data = self._download_json( +            'https://api.nbc.com/v3/videos', video_id, query={ +                'filter[permalink]': permalink, +            })['data'][0]['attributes'] +        query = { +            'mbr': 'true', +            'manifest': 'm3u', +        } +        video_id = video_data['guid'] +        title = video_data['title'] +        if video_data.get('entitlement') == 'auth': +            resource = self._get_mvpd_resource( +                'nbcentertainment', title, video_id, 
+                video_data.get('vChipRating')) +            query['auth'] = self._extract_mvpd_auth( +                url, video_id, 'nbcentertainment', resource) +        theplatform_url = smuggle_url(update_url_query( +            'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, +            query), {'force_smil_url': True}) +        return {              '_type': 'url_transparent', -            'ie_key': 'ThePlatform',              'id': video_id, +            'title': title, +            'url': theplatform_url, +            'description': video_data.get('description'), +            'keywords': video_data.get('keywords'), +            'season_number': int_or_none(video_data.get('seasonNumber')), +            'episode_number': int_or_none(video_data.get('episodeNumber')), +            'series': video_data.get('showName'), +            'ie_key': 'ThePlatform',          } -        video_data = None -        preload = self._search_regex( -            r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None) -        if preload: -            preload_data = self._parse_json(preload, video_id) -            path = compat_urllib_parse_urlparse(url).path.rstrip('/') -            entity_id = preload_data.get('xref', {}).get(path) -            video_data = preload_data.get('entities', {}).get(entity_id) -        if video_data: -            query = { -                'mbr': 'true', -                'manifest': 'm3u', -            } -            video_id = video_data['guid'] -            title = video_data['title'] -            if video_data.get('entitlement') == 'auth': -                resource = self._get_mvpd_resource( -                    'nbcentertainment', title, video_id, -                    video_data.get('vChipRating')) -                query['auth'] = self._extract_mvpd_auth( -                    url, video_id, 'nbcentertainment', resource) -            theplatform_url = smuggle_url(update_url_query( -                'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, -                query), {'force_smil_url': True}) -            info.update({ -                'id': video_id, -                'title': title, -                'url': theplatform_url, -                'description': video_data.get('description'), -                'keywords': video_data.get('keywords'), -                'season_number': int_or_none(video_data.get('seasonNumber')), -                'episode_number': int_or_none(video_data.get('episodeNumber')), -                'series': video_data.get('showName'), -            }) -        else: -            theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex( -                [ -                    r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"', -                    r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"', -                    r'"embedURL"\s*:\s*"([^"]+)"' -                ], -                webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/'))) -            if theplatform_url.startswith('//'): -                theplatform_url = 'http:' + theplatform_url -            info['url'] = smuggle_url(theplatform_url, {'source_url': url}) -        return info  class NBCSportsVPlayerIE(InfoExtractor): diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index 9bea610c8..0e26f8399 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -1,6 +1,15 @@  from 
__future__ import unicode_literals +import re +  from .common import InfoExtractor +from ..utils import ( +    extract_attributes, +    int_or_none, +    parse_duration, +    parse_filesize, +    unified_timestamp, +)  class NewgroundsIE(InfoExtractor): @@ -13,7 +22,10 @@ class NewgroundsIE(InfoExtractor):              'ext': 'mp3',              'title': 'B7 - BusMode',              'uploader': 'Burn7', -        } +            'timestamp': 1378878540, +            'upload_date': '20130911', +            'duration': 143, +        },      }, {          'url': 'https://www.newgrounds.com/portal/view/673111',          'md5': '3394735822aab2478c31b1004fe5e5bc', @@ -22,25 +34,133 @@ class NewgroundsIE(InfoExtractor):              'ext': 'mp4',              'title': 'Dancin',              'uploader': 'Squirrelman82', +            'timestamp': 1460256780, +            'upload_date': '20160410', +        }, +    }, { +        # source format unavailable, additional mp4 formats +        'url': 'http://www.newgrounds.com/portal/view/689400', +        'info_dict': { +            'id': '689400', +            'ext': 'mp4', +            'title': 'ZTV News Episode 8', +            'uploader': 'BennettTheSage', +            'timestamp': 1487965140, +            'upload_date': '20170224', +        }, +        'params': { +            'skip_download': True,          },      }]      def _real_extract(self, url):          media_id = self._match_id(url) +          webpage = self._download_webpage(url, media_id)          title = self._html_search_regex(              r'<title>([^>]+)</title>', webpage, 'title') -        uploader = self._html_search_regex( -            r'Author\s*<a[^>]+>([^<]+)', webpage, 'uploader', fatal=False) +        media_url = self._parse_json(self._search_regex( +            r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id) + +        formats = [{ +            'url': media_url, +            'format_id': 'source', +            'quality': 1, +        }] + +        max_resolution = int_or_none(self._search_regex( +            r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution', +            default=None)) +        if max_resolution: +            url_base = media_url.rpartition('.')[0] +            for resolution in (360, 720, 1080): +                if resolution > max_resolution: +                    break +                formats.append({ +                    'url': '%s.%dp.mp4' % (url_base, resolution), +                    'format_id': '%dp' % resolution, +                    'height': resolution, +                }) + +        self._check_formats(formats, media_id) +        self._sort_formats(formats) -        music_url = self._parse_json(self._search_regex( -            r'"url":("[^"]+"),', webpage, ''), media_id) +        uploader = self._search_regex( +            r'(?:Author|Writer)\s*<a[^>]+>([^<]+)', webpage, 'uploader', +            fatal=False) + +        timestamp = unified_timestamp(self._search_regex( +            r'<dt>Uploaded</dt>\s*<dd>([^<]+)', webpage, 'timestamp', +            default=None)) +        duration = parse_duration(self._search_regex( +            r'<dd>Song\s*</dd><dd>.+?</dd><dd>([^<]+)', webpage, 'duration', +            default=None)) + +        filesize_approx = parse_filesize(self._html_search_regex( +            r'<dd>Song\s*</dd><dd>(.+?)</dd>', webpage, 'filesize', +            default=None)) +        if len(formats) == 1: +            formats[0]['filesize_approx'] = filesize_approx + +        if '<dd>Song' in webpage: +            
formats[0]['vcodec'] = 'none'          return {              'id': media_id,              'title': title, -            'url': music_url,              'uploader': uploader, +            'timestamp': timestamp, +            'duration': duration, +            'formats': formats,          } + + +class NewgroundsPlaylistIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)' +    _TESTS = [{ +        'url': 'https://www.newgrounds.com/collection/cats', +        'info_dict': { +            'id': 'cats', +            'title': 'Cats', +        }, +        'playlist_mincount': 46, +    }, { +        'url': 'http://www.newgrounds.com/portal/search/author/ZONE-SAMA', +        'info_dict': { +            'id': 'ZONE-SAMA', +            'title': 'Portal Search: ZONE-SAMA', +        }, +        'playlist_mincount': 47, +    }, { +        'url': 'http://www.newgrounds.com/audio/search/title/cats', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        title = self._search_regex( +            r'<title>([^>]+)</title>', webpage, 'title', default=None) + +        # cut left menu +        webpage = self._search_regex( +            r'(?s)<div[^>]+\bclass=["\']column wide(.+)', +            webpage, 'wide column', default=webpage) + +        entries = [] +        for a, path, media_id in re.findall( +                r'(<a[^>]+\bhref=["\']/?((?:portal/view|audio/listen)/(\d+))[^>]+>)', +                webpage): +            a_class = extract_attributes(a).get('class') +            if a_class not in ('item-portalsubmission', 'item-audiosubmission'): +                continue +            entries.append( +                self.url_result( +                    'https://www.newgrounds.com/%s' % path, +                    ie=NewgroundsIE.ie_key(), video_id=media_id)) + +        return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py new file mode 100644 index 000000000..d0235fdfe --- /dev/null +++ b/youtube_dl/extractor/nexx.py @@ -0,0 +1,271 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import random +import re +import time + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    ExtractorError, +    int_or_none, +    parse_duration, +    try_get, +    urlencode_postdata, +) + + +class NexxIE(InfoExtractor): +    _VALID_URL = r'https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/(?P<id>\d+)' +    _TESTS = [{ +        # movie +        'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', +        'md5': '16746bfc28c42049492385c989b26c4a', +        'info_dict': { +            'id': '128907', +            'ext': 'mp4', +            'title': 'Stiftung Warentest', +            'alt_title': 'Wie ein Test abläuft', +            'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', +            'release_year': 2013, +            'creator': 'SPIEGEL TV', +            'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 2509, +            'timestamp': 1384264416, +            'upload_date': '20131112', +        }, +        'params': { +            'format': 'bestvideo', +        }, +    }, { +        # episode +        'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858', +        'info_dict': { +            'id': '247858', +        
    'ext': 'mp4', +            'title': 'Return of the Golden Child (OV)', +            'description': 'md5:5d969537509a92b733de21bae249dc63', +            'release_year': 2017, +            'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 1397, +            'timestamp': 1495033267, +            'upload_date': '20170517', +            'episode_number': 2, +            'season_number': 2, +        }, +        'params': { +            'format': 'bestvideo', +            'skip_download': True, +        }, +    }, { +        'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_urls(webpage): +        # Reference: +        # 1. https://nx-s.akamaized.net/files/201510/44.pdf + +        entries = [] + +        # JavaScript Integration +        mobj = re.search( +            r'<script\b[^>]+\bsrc=["\']https?://require\.nexx(?:\.cloud|cdn\.com)/(?P<id>\d+)', +            webpage) +        if mobj: +            domain_id = mobj.group('id') +            for video_id in re.findall( +                    r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)', +                    webpage): +                entries.append( +                    'https://api.nexx.cloud/v3/%s/videos/byid/%s' +                    % (domain_id, video_id)) + +        # TODO: support more embed formats + +        return entries + +    @staticmethod +    def _extract_url(webpage): +        return NexxIE._extract_urls(webpage)[0] + +    def _handle_error(self, response): +        status = int_or_none(try_get( +            response, lambda x: x['metadata']['status']) or 200) +        if 200 <= status < 300: +            return +        raise ExtractorError( +            '%s said: %s' % (self.IE_NAME, response['metadata']['errorhint']), +            expected=True) + +    def _call_api(self, domain_id, path, video_id, data=None, headers={}): +        headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8' +        result = self._download_json( +            'https://api.nexx.cloud/v3/%s/%s' % (domain_id, path), video_id, +            'Downloading %s JSON' % path, data=urlencode_postdata(data), +            headers=headers) +        self._handle_error(result) +        return result['result'] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        domain_id, video_id = mobj.group('domain_id', 'id') + +        # Reverse engineered from JS code (see getDeviceID function) +        device_id = '%d:%d:%d%d' % ( +            random.randint(1, 4), int(time.time()), +            random.randint(1e4, 99999), random.randint(1, 9)) + +        result = self._call_api(domain_id, 'session/init', video_id, data={ +            'nxp_devh': device_id, +            'nxp_userh': '', +            'precid': '0', +            'playlicense': '0', +            'screenx': '1920', +            'screeny': '1080', +            'playerversion': '6.0.00', +            'gateway': 'html5', +            'adGateway': '', +            'explicitlanguage': 'en-US', +            'addTextTemplates': '1', +            'addDomainData': '1', +            'addAdModel': '1', +        }, headers={ +            'X-Request-Enable-Auth-Fallback': '1', +        }) + +        cid = result['general']['cid'] + +        # As described in [1] X-Request-Token generation algorithm is +        # as follows: +        #   md5( operation + domain_id + domain_secret ) +        # where domain_secret is a static value that will be given 
by nexx.tv
+        # as per [1]. Here is how this "secret" is generated (reversed
+        # from _play.api.init function, search for clienttoken). So it's
+        # actually not static and not that much of a secret.
+        # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf
+        secret = result['device']['clienttoken'][int(device_id[0]):]
+        secret = secret[0:len(secret) - int(device_id[-1])]
+
+        op = 'byid'
+
+        # Reversed from JS code for _play.api.call function (search for
+        # X-Request-Token)
+        request_token = hashlib.md5(
+            ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest()
+
+        video = self._call_api(
+            domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={
+                'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description',
+                'addInteractionOptions': '1',
+                'addStatusDetails': '1',
+                'addStreamDetails': '1',
+                'addCaptions': '1',
+                'addScenes': '1',
+                'addHotSpots': '1',
+                'addBumpers': '1',
+                'captionFormat': 'data',
+            }, headers={
+                'X-Request-CID': cid,
+                'X-Request-Token': request_token,
+            })
+
+        general = video['general']
+        title = general['title']
+
+        stream_data = video['streamdata']
+        language = general.get('language_raw') or ''
+
+        # TODO: reverse more cdns and formats
+
+        cdn = stream_data['cdnType']
+        assert cdn == 'azure'
+
+        azure_locator = stream_data['azureLocator']
+
+        AZURE_URL = 'http://nx-p%02d.akamaized.net/'
+
+        for secure in ('s', ''):
+            cdn_shield = stream_data.get('cdnShieldHTTP%s' % secure.upper())
+            if cdn_shield:
+                azure_base = 'http%s://%s' % (secure, cdn_shield)
+                break
+        else:
+            azure_base = AZURE_URL % int(stream_data['azureAccount'].replace('nexxplayplus', ''))
+
+        is_ml = ',' in language
+        azure_m3u8_url = '%s%s/%s_src%s.ism/Manifest(format=m3u8-aapl)' % (
+            azure_base, azure_locator, video_id, ('_manifest' if is_ml else ''))
+
+        protection_token = try_get(
+            video, lambda x: x['protectiondata']['token'], compat_str)
+        if protection_token:
+            azure_m3u8_url += '?hdnts=%s' % protection_token
+
+        formats = self._extract_m3u8_formats(
+            azure_m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='%s-hls' % cdn)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'alt_title': general.get('subtitle'),
+            'description': general.get('description'),
+            'release_year': int_or_none(general.get('year')),
+            'creator': general.get('studio') or general.get('studio_adref'),
+            'thumbnail': try_get(
+                video, lambda x: x['imagedata']['thumb'], compat_str),
+            'duration': parse_duration(general.get('runtime')),
+            'timestamp': int_or_none(general.get('uploaded')),
+            'episode_number': int_or_none(try_get(
+                video, lambda x: x['episodedata']['episode'])),
+            'season_number': int_or_none(try_get(
+                video, lambda x: x['episodedata']['season'])),
+            'formats': formats,
+        }
+
+
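Editor's note: the X-Request-Token derivation implemented above condenses to a few lines: the "secret" is carved out of the session's clienttoken using the first and last digits of the generated device ID, and md5(operation + domain_id + secret) is sent with the video API call. A standalone restatement, assuming all arguments are strings and the device ID ends in a non-zero digit as the generator above guarantees (the function name is illustrative):

    import hashlib

    def request_token(operation, domain_id, client_token, device_id):
        # Strip int(device_id[0]) characters from the front and
        # int(device_id[-1]) characters from the back of clienttoken to
        # obtain the domain "secret", then hash operation + domain_id + secret.
        secret = client_token[int(device_id[0]):]
        secret = secret[:len(secret) - int(device_id[-1])]
        return hashlib.md5(
            (operation + domain_id + secret).encode('utf-8')).hexdigest()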
+class NexxEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
+        'md5': '16746bfc28c42049492385c989b26c4a',
+        'info_dict': {
+            'id': '161464',
+            'ext': 'mp4',
+            'title': 'Nervenkitzel Achterbahn',
+            'alt_title': 'Karussellbauer in Deutschland',
+            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
+            'release_year': 2005,
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2761,
+            'timestamp': 1394021479,
+            'upload_date': '20140305',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # Reference:
+        # 1. https://nx-s.akamaized.net/files/201510/44.pdf
+
+        # iFrame Embed Integration
+        return [mobj.group('url') for mobj in re.finditer(
+            r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        embed_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, embed_id)
+
+        return self.url_result(NexxIE._extract_url(webpage), ie=NexxIE.ie_key())
diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py
index 08a75929e..510b1c41f 100644
--- a/youtube_dl/extractor/nick.py
+++ b/youtube_dl/extractor/nick.py
@@ -12,6 +12,7 @@ class NickIE(MTVServicesInfoExtractor):
     IE_NAME = 'nick.com'
     _VALID_URL = r'https?://(?:(?:www|beta)\.)?nick(?:jr)?\.com/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'
     _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
+    _GEO_COUNTRIES = ['US']
     _TESTS = [{
         'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
         'playlist': [
@@ -74,7 +75,7 @@ class NickIE(MTVServicesInfoExtractor):
 
 class NickDeIE(MTVServicesInfoExtractor):
     IE_NAME = 'nick.de'
-    _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.de|nickelodeon\.(?:nl|at))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl)|nickelodeon\.(?:nl|at))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse',
         'only_matching': True,
@@ -87,6 +88,9 @@ class NickDeIE(MTVServicesInfoExtractor):
     }, {
         'url': 'http://www.nickelodeon.at/playlist/3773-top-videos/videos/episode/77993-das-letzte-gefecht',
         'only_matching': True,
+    }, {
+        'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom',
+        'only_matching': True,
     }]
 
     def _extract_mrss_url(self, webpage, host):
@@ -124,3 +128,21 @@ class NickNightIE(NickDeIE):
         return self._search_regex(
             r'mrss\s*:\s*(["\'])(?P<url>http.+?)\1', webpage,
             'mrss url', group='url')
+
+
+class NickRuIE(MTVServicesInfoExtractor):
+    IE_NAME = 'nickelodeonru'
+    _VALID_URL = r'https?://(?:www\.)?nickelodeon\.ru/(?:playlist|shows|videos)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 
'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6', +        'only_matching': True, +    }, { +        'url': 'http://www.nickelodeon.ru/videos/smotri-na-nickelodeon-v-iyule/g9hvh7', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        mgid = self._extract_mgid(webpage) +        return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 8baac23e4..026329d3e 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -1,23 +1,27 @@  # coding: utf-8  from __future__ import unicode_literals -import re  import json  import datetime  from .common import InfoExtractor  from ..compat import ( +    compat_parse_qs,      compat_urlparse,  )  from ..utils import ( +    determine_ext, +    dict_get,      ExtractorError,      int_or_none, +    float_or_none,      parse_duration,      parse_iso8601, -    sanitized_Request, -    xpath_text, -    determine_ext, +    remove_start, +    try_get, +    unified_timestamp,      urlencode_postdata, +    xpath_text,  ) @@ -32,12 +36,15 @@ class NiconicoIE(InfoExtractor):              'id': 'sm22312215',              'ext': 'mp4',              'title': 'Big Buck Bunny', +            'thumbnail': r're:https?://.*',              'uploader': 'takuya0301',              'uploader_id': '2698420',              'upload_date': '20131123',              'timestamp': 1385182762,              'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',              'duration': 33, +            'view_count': int, +            'comment_count': int,          },          'skip': 'Requires an account',      }, { @@ -49,6 +56,7 @@ class NiconicoIE(InfoExtractor):              'ext': 'swf',              'title': '【鏡音リン】Dance on media【オリジナル】take2!',              'description': 'md5:689f066d74610b3b22e0f1739add0f58', +            'thumbnail': r're:https?://.*',              'uploader': 'りょうた',              'uploader_id': '18822557',              'upload_date': '20110429', @@ -65,9 +73,11 @@ class NiconicoIE(InfoExtractor):              'ext': 'unknown_video',              'description': 'deleted',              'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>', +            'thumbnail': r're:https?://.*',              'upload_date': '20071224',              'timestamp': int,  # timestamp field has different value if logged in              'duration': 304, +            'view_count': int,          },          'skip': 'Requires an account',      }, { @@ -77,15 +87,57 @@ class NiconicoIE(InfoExtractor):              'ext': 'mp4',              'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~',              'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1', +            'thumbnail': r're:https?://.*',              'timestamp': 1388851200,              'upload_date': '20140104',              'uploader': 'アニメロチャンネル',              'uploader_id': '312',          },          'skip': 'The viewing period of the video you were searching for has expired.', +    }, { +        # video not available via `getflv`; "old" HTML5 video +        'url': 'http://www.nicovideo.jp/watch/sm1151009', +        'md5': '8fa81c364eb619d4085354eab075598a', +        'info_dict': { +            'id': 'sm1151009', +            'ext': 'mp4', +            'title': 
'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)', +            'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7', +            'thumbnail': r're:https?://.*', +            'duration': 184, +            'timestamp': 1190868283, +            'upload_date': '20070927', +            'uploader': 'denden2', +            'uploader_id': '1392194', +            'view_count': int, +            'comment_count': int, +        }, +        'skip': 'Requires an account', +    }, { +        # "New" HTML5 video +        'url': 'http://www.nicovideo.jp/watch/sm31464864', +        'md5': '351647b4917660986dc0fa8864085135', +        'info_dict': { +            'id': 'sm31464864', +            'ext': 'mp4', +            'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質', +            'description': 'md5:e52974af9a96e739196b2c1ca72b5feb', +            'timestamp': 1498514060, +            'upload_date': '20170626', +            'uploader': 'ゲス', +            'uploader_id': '40826363', +            'thumbnail': r're:https?://.*', +            'duration': 198, +            'view_count': int, +            'comment_count': int, +        }, +        'skip': 'Requires an account', +    }, { +        'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', +        'only_matching': True,      }] -    _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' +    _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'      _NETRC_MACHINE = 'niconico'      def _real_initialize(self): @@ -98,19 +150,102 @@ class NiconicoIE(InfoExtractor):              return True          # Log in +        login_ok = True          login_form_strs = { -            'mail': username, +            'mail_tel': username,              'password': password,          } -        login_data = urlencode_postdata(login_form_strs) -        request = sanitized_Request( -            'https://secure.nicovideo.jp/secure/login', login_data) -        login_results = self._download_webpage( -            request, None, note='Logging in', errnote='Unable to log in') -        if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: +        urlh = self._request_webpage( +            'https://account.nicovideo.jp/api/v1/login', None, +            note='Logging in', errnote='Unable to log in', +            data=urlencode_postdata(login_form_strs)) +        if urlh is False: +            login_ok = False +        else: +            parts = compat_urlparse.urlparse(urlh.geturl()) +            if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login': +                login_ok = False +        if not login_ok:              self._downloader.report_warning('unable to log in: bad username or password') -            return False -        return True +        return login_ok + +    def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): +        def yesno(boolean): +            return 'yes' if boolean else 'no' + +        session_api_data = api_data['video']['dmcInfo']['session_api'] +        session_api_endpoint = session_api_data['urls'][0] + +        format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) + +        session_response = self._download_json( +            session_api_endpoint['url'], video_id, +            query={'_format': 'json'}, +            headers={'Content-Type': 'application/json'}, +            note='Downloading JSON metadata for %s' % format_id, +            
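            # [Editor's note, not part of the patch: the dict serialized just
            # below is the DMC session request reverse-engineered from the HTML5
            # player, so every key name is dictated by the undocumented Session
            # API. Hypothetical quality ids illustrate how format_id above is
            # built:
            #     video_quality['id'] = 'archive_h264_1080p'  -> 'h264_1080p'
            #     audio_quality['id'] = 'archive_aac_128kbps' -> 'aac_128kbps'
            #     format_id == 'h264_1080p-aac_128kbps']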
data=json.dumps({
+                'session': {
+                    'client_info': {
+                        'player_id': session_api_data['player_id'],
+                    },
+                    'content_auth': {
+                        'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]],
+                        'content_key_timeout': session_api_data['content_key_timeout'],
+                        'service_id': 'nicovideo',
+                        'service_user_id': session_api_data['service_user_id']
+                    },
+                    'content_id': session_api_data['content_id'],
+                    'content_src_id_sets': [{
+                        'content_src_ids': [{
+                            'src_id_to_mux': {
+                                'audio_src_ids': [audio_quality['id']],
+                                'video_src_ids': [video_quality['id']],
+                            }
+                        }]
+                    }],
+                    'content_type': 'movie',
+                    'content_uri': '',
+                    'keep_method': {
+                        'heartbeat': {
+                            'lifetime': session_api_data['heartbeat_lifetime']
+                        }
+                    },
+                    'priority': session_api_data['priority'],
+                    'protocol': {
+                        'name': 'http',
+                        'parameters': {
+                            'http_parameters': {
+                                'parameters': {
+                                    'http_output_download_parameters': {
+                                        'use_ssl': yesno(session_api_endpoint['is_ssl']),
+                                        'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']),
+                                    }
+                                }
+                            }
+                        }
+                    },
+                    'recipe_id': session_api_data['recipe_id'],
+                    'session_operation_auth': {
+                        'session_operation_auth_by_signature': {
+                            'signature': session_api_data['signature'],
+                            'token': session_api_data['token'],
+                        }
+                    },
+                    'timing_constraint': 'unlimited'
+                }
+            }))
+
+        resolution = video_quality.get('resolution', {})
+
+        return {
+            'url': session_response['data']['session']['content_uri'],
+            'format_id': format_id,
+            'ext': 'mp4',  # the Session API is used by HTML5 videos, which are always served as mp4
+            'abr': float_or_none(audio_quality.get('bitrate'), 1000),
+            'vbr': float_or_none(video_quality.get('bitrate'), 1000),
+            'height': resolution.get('height'),
+            'width': resolution.get('width'),
+        }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -123,30 +258,84 @@
         if video_id.startswith('so'):
             video_id = self._match_id(handle.geturl())
 
-        video_info = self._download_xml(
-            'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
-            note='Downloading video info page')
-
-        # Get flv info
-        flv_info_webpage = self._download_webpage(
-            'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
-            video_id, 'Downloading flv info')
-
-        flv_info = compat_urlparse.parse_qs(flv_info_webpage)
-        if 'url' not in flv_info:
-            if 'deleted' in flv_info:
-                raise ExtractorError('The video has been deleted.',
-                                     expected=True)
-            elif 'closed' in flv_info:
-                raise ExtractorError('Niconico videos now require logging in',
-                                     expected=True)
-            else:
-                raise ExtractorError('Unable to find video URL')
-
-        video_real_url = flv_info['url'][0]
+        api_data = self._parse_json(self._html_search_regex(
+            'data-api-data="([^"]+)"', webpage,
+            'API data', default='{}'), video_id)
+
+        def _format_id_from_url(video_url):
+            return 'economy' if video_url.endswith('low') else 'normal'
+
+        try:
+            video_real_url = api_data['video']['smileInfo']['url']
+        except KeyError:  # Flash videos
+            # Get flv info
+            flv_info_webpage = self._download_webpage(
+                'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
+                video_id, 'Downloading flv info')
+
+            flv_info = compat_urlparse.parse_qs(flv_info_webpage)
+            if 'url' not in flv_info:
+                if 'deleted' in flv_info:
+                    raise ExtractorError('The video has been deleted.',
+                                         expected=True)
+                elif 'closed' in flv_info:
+                    raise ExtractorError('Niconico videos now require logging in',
+                                         expected=True)
+                elif 'error' in flv_info:
+                    raise ExtractorError('%s reports error: %s' % (
+                        self.IE_NAME, flv_info['error'][0]), expected=True)
+                else:
+                    raise ExtractorError('Unable to find video URL')
+
+            video_info_xml = self._download_xml(
+                'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
+                video_id, note='Downloading video info page')
+
+            def get_video_info(items):
+                if not isinstance(items, list):
+                    items = [items]
+                for item in items:
+                    ret = xpath_text(video_info_xml, './/' + item)
+                    if ret:
+                        return ret
+
+            video_real_url = flv_info['url'][0]
+
+            extension = get_video_info('movie_type')
+            if not extension:
+                extension = determine_ext(video_real_url)
+
+            formats = [{
+                'url': video_real_url,
+                'ext': extension,
+                'format_id': _format_id_from_url(video_real_url),
+            }]
+        else:
+            formats = []
+
+            dmc_info = api_data['video'].get('dmcInfo')
+            if dmc_info:  # "New" HTML5 videos
+                quality_info = dmc_info['quality']
+                for audio_quality in quality_info['audios']:
+                    for video_quality in quality_info['videos']:
+                        if not audio_quality['available'] or not video_quality['available']:
+                            continue
+                        formats.append(self._extract_format_for_quality(
+                            api_data, video_id, audio_quality, video_quality))
+
+                self._sort_formats(formats)
+            else:  # "Old" HTML5 videos
+    
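            # [Editor's note, not part of the patch: "old" HTML5 pages expose a
            # single direct URL in api_data['video']['smileInfo']['url']; its
            # economy (low-bitrate) variant is recognized purely by the URL
            # ending in 'low', which _format_id_from_url above maps to the
            # 'economy'/'normal' format_id.]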
            formats = [{ +                    'url': video_real_url, +                    'ext': 'mp4', +                    'format_id': _format_id_from_url(video_real_url), +                }] + +            def get_video_info(items): +                return dict_get(api_data['video'], items)          # Start extracting information -        title = xpath_text(video_info, './/title') +        title = get_video_info('title')          if not title:              title = self._og_search_title(webpage, default=None)          if not title: @@ -160,18 +349,15 @@ class NiconicoIE(InfoExtractor):          watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {}          video_detail = watch_api_data.get('videoDetail', {}) -        extension = xpath_text(video_info, './/movie_type') -        if not extension: -            extension = determine_ext(video_real_url) -          thumbnail = ( -            xpath_text(video_info, './/thumbnail_url') or +            get_video_info(['thumbnail_url', 'thumbnailURL']) or              self._html_search_meta('image', webpage, 'thumbnail', default=None) or              video_detail.get('thumbnail')) -        description = xpath_text(video_info, './/description') +        description = get_video_info('description') -        timestamp = parse_iso8601(xpath_text(video_info, './/first_retrieve')) +        timestamp = (parse_iso8601(get_video_info('first_retrieve')) or +                     unified_timestamp(get_video_info('postedDateTime')))          if not timestamp:              match = self._html_search_meta('datePublished', webpage, 'date published', default=None)              if match: @@ -181,7 +367,7 @@ class NiconicoIE(InfoExtractor):                  video_detail['postedAt'].replace('/', '-'),                  delimiter=' ', timezone=datetime.timedelta(hours=9)) -        view_count = int_or_none(xpath_text(video_info, './/view_counter')) +        view_count = int_or_none(get_video_info(['view_counter', 'viewCount']))          if not view_count:              match = self._html_search_regex(                  r'>Views: <strong[^>]*>([^<]+)</strong>', @@ -190,38 +376,33 @@ class NiconicoIE(InfoExtractor):                  view_count = int_or_none(match.replace(',', ''))          view_count = view_count or video_detail.get('viewCount') -        comment_count = int_or_none(xpath_text(video_info, './/comment_num')) +        comment_count = (int_or_none(get_video_info('comment_num')) or +                         video_detail.get('commentCount') or +                         try_get(api_data, lambda x: x['thread']['commentCount']))          if not comment_count:              match = self._html_search_regex(                  r'>Comments: <strong[^>]*>([^<]+)</strong>',                  webpage, 'comment count', default=None)              if match:                  comment_count = int_or_none(match.replace(',', '')) -        comment_count = comment_count or video_detail.get('commentCount')          duration = (parse_duration( -            xpath_text(video_info, './/length') or +            get_video_info('length') or              self._html_search_meta(                  'video:duration', webpage, 'video duration', default=None)) or -            video_detail.get('length')) +            video_detail.get('length') or +            get_video_info('duration')) -        webpage_url = xpath_text(video_info, './/watch_url') or url +        webpage_url = get_video_info('watch_url') or url -        if video_info.find('.//ch_id') is not 
None: -            uploader_id = video_info.find('.//ch_id').text -            uploader = video_info.find('.//ch_name').text -        elif video_info.find('.//user_id') is not None: -            uploader_id = video_info.find('.//user_id').text -            uploader = video_info.find('.//user_nickname').text -        else: -            uploader_id = uploader = None +        owner = api_data.get('owner', {}) +        uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id') +        uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname')          return {              'id': video_id, -            'url': video_real_url,              'title': title, -            'ext': extension, -            'format_id': 'economy' if video_real_url.endswith('low') else 'normal', +            'formats': formats,              'thumbnail': thumbnail,              'description': description,              'uploader': uploader, diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index f5e3f6815..9b5ad5a9f 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -6,6 +6,7 @@ import re  from .common import InfoExtractor  from ..compat import compat_urlparse  from ..utils import ( +    extract_attributes,      get_element_by_class,      urlencode_postdata,  ) @@ -56,17 +57,24 @@ class NJPWWorldIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          formats = [] -        for player_url, kind in re.findall(r'<a[^>]+href="(/player[^"]+)".+?<img[^>]+src="[^"]+qf_btn_([^".]+)', webpage): -            player_url = compat_urlparse.urljoin(url, player_url) - +        for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage): +            player = extract_attributes(mobj.group(0)) +            player_path = player.get('href') +            if not player_path: +                continue +            kind = self._search_regex( +                r'(low|high)$', player.get('class') or '', 'kind', +                default='low') +            player_url = compat_urlparse.urljoin(url, player_path)              player_page = self._download_webpage(                  player_url, video_id, note='Downloading player page') -              entries = self._parse_html5_media_entries(                  player_url, player_page, video_id, m3u8_id='hls-%s' % kind, -                m3u8_entry_protocol='m3u8_native', -                preference=2 if 'hq' in kind else 1) -            formats.extend(entries[0]['formats']) +                m3u8_entry_protocol='m3u8_native') +            kind_formats = entries[0]['formats'] +            for f in kind_formats: +                f['quality'] = 2 if kind == 'high' else 1 +            formats.extend(kind_formats)          self._sort_formats(formats) diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py new file mode 100644 index 000000000..63e58aae2 --- /dev/null +++ b/youtube_dl/extractor/nonktube.py @@ -0,0 +1,33 @@ +from __future__ import unicode_literals + +from .nuevo import NuevoBaseIE + + +class NonkTubeIE(NuevoBaseIE): +    _VALID_URL = r'https?://(?:www\.)?nonktube\.com/(?:(?:video|embed)/|media/nuevo/embed\.php\?.*?\bid=)(?P<id>\d+)' +    _TESTS = [{ +        'url': 'https://www.nonktube.com/video/118636/sensual-wife-uncensored-fucked-in-hairy-pussy-and-facialized', +        'info_dict': { +            'id': '118636', +            'ext': 'mp4', +            'title': 'Sensual Wife Uncensored Fucked In Hairy Pussy And Facialized', +  
          'age_limit': 18, +            'duration': 1150.98, +        }, +        'params': { +            'skip_download': True, +        } +    }, { +        'url': 'https://www.nonktube.com/embed/118636', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        info = self._extract_nuevo( +            'https://www.nonktube.com/media/nuevo/econfig.php?key=%s' +            % video_id, video_id) + +        info['age_limit'] = 18 +        return info diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py index f7fa098a5..974de3c3e 100644 --- a/youtube_dl/extractor/noovo.py +++ b/youtube_dl/extractor/noovo.py @@ -6,6 +6,7 @@ from .common import InfoExtractor  from ..compat import compat_str  from ..utils import (      int_or_none, +    js_to_json,      smuggle_url,      try_get,  ) @@ -24,8 +25,6 @@ class NoovoIE(InfoExtractor):              'timestamp': 1491399228,              'upload_date': '20170405',              'uploader_id': '618566855001', -            'creator': 'vtele', -            'view_count': int,              'series': 'RPM+',          },          'params': { @@ -37,13 +36,11 @@ class NoovoIE(InfoExtractor):          'info_dict': {              'id': '5395865725001',              'title': 'Épisode 13 : Les retrouvailles', -            'description': 'md5:336d5ebc5436534e61d16e63ddfca327', +            'description': 'md5:888c3330f0c1b4476c5bc99a1c040473',              'ext': 'mp4',              'timestamp': 1492019320,              'upload_date': '20170412',              'uploader_id': '618566855001', -            'creator': 'vtele', -            'view_count': int,              'series': "L'amour est dans le pré",              'season_number': 5,              'episode': 'Épisode 13', @@ -58,40 +55,46 @@ class NoovoIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        data = self._download_json( -            'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, -            video_id)['data'] +        webpage = self._download_webpage(url, video_id) -        content = try_get(data, lambda x: x['contents'][0]) +        bc_url = BrightcoveNewIE._extract_url(self, webpage) -        brightcove_id = data.get('brightcoveId') or content['brightcoveId'] +        data = self._parse_json( +            self._search_regex( +                r'(?s)dataLayer\.push\(\s*({.+?})\s*\);', webpage, 'data', +                default='{}'), +            video_id, transform_source=js_to_json, fatal=False) + +        title = try_get( +            data, lambda x: x['video']['nom'], +            compat_str) or self._html_search_meta( +            'dcterms.Title', webpage, 'title', fatal=True) + +        description = self._html_search_meta( +            ('dcterms.Description', 'description'), webpage, 'description')          series = try_get( -            data, ( -                lambda x: x['show']['title'], -                lambda x: x['season']['show']['title']), -            compat_str) +            data, lambda x: x['emission']['nom']) or self._search_regex( +            r'<div[^>]+class="banner-card__subtitle h4"[^>]*>([^<]+)', +            webpage, 'series', default=None) -        episode = None -        og = data.get('og') -        if isinstance(og, dict) and og.get('type') == 'video.episode': -            episode = og.get('title') +        season_el = try_get(data, lambda x: x['emission']['saison'], dict) or {} +        season = try_get(season_el, 
lambda x: x['nom'], compat_str) +        season_number = int_or_none(try_get(season_el, lambda x: x['numero'])) -        video = content or data +        episode_el = try_get(season_el, lambda x: x['episode'], dict) or {} +        episode = try_get(episode_el, lambda x: x['nom'], compat_str) +        episode_number = int_or_none(try_get(episode_el, lambda x: x['numero']))          return {              '_type': 'url_transparent',              'ie_key': BrightcoveNewIE.ie_key(), -            'url': smuggle_url( -                self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, -                {'geo_countries': ['CA']}), -            'id': brightcove_id, -            'title': video.get('title'), -            'creator': video.get('source'), -            'view_count': int_or_none(video.get('viewsCount')), +            'url': smuggle_url(bc_url, {'geo_countries': ['CA']}), +            'title': title, +            'description': description,              'series': series, -            'season_number': int_or_none(try_get( -                data, lambda x: x['season']['seasonNumber'])), +            'season': season, +            'season_number': season_number,              'episode': episode, -            'episode_number': int_or_none(data.get('episodeNumber')), +            'episode_number': episode_number,          } diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 79296f0ef..fa4ef20c5 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -28,17 +28,17 @@ class NPOBaseIE(InfoExtractor):  class NPOIE(NPOBaseIE):      IE_NAME = 'npo' -    IE_DESC = 'npo.nl and ntr.nl' +    IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl'      _VALID_URL = r'''(?x)                      (?:                          npo:|                          https?://                              (?:www\.)?                              
(?: -                                npo\.nl/(?!live|radio)(?:[^/]+/){2}| +                                npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}|                                  ntr\.nl/(?:[^/]+/){2,}|                                  omroepwnl\.nl/video/fragment/[^/]+__| -                                zapp\.nl/[^/]+/[^/]+/ +                                (?:zapp|npo3)\.nl/(?:[^/]+/){2}                              )                          )                          (?P<id>[^/?#]+) @@ -147,9 +147,15 @@ class NPOIE(NPOBaseIE):          'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',          'only_matching': True,      }, { +        'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870', +        'only_matching': True, +    }, {          # live stream          'url': 'npo:LI_NL1_4188102',          'only_matching': True, +    }, { +        'url': 'http://www.npo.nl/radio-gaga/13-06-2017/BNN_101383373', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -338,7 +344,7 @@ class NPOLiveIE(NPOBaseIE):          webpage = self._download_webpage(url, display_id)          live_id = self._search_regex( -            r'data-prid="([^"]+)"', webpage, 'live id') +            [r'media-id="([^"]+)"', r'data-prid="([^"]+)"'], webpage, 'live id')          return {              '_type': 'url_transparent', diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 7fe79cb53..18ead9426 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -148,13 +148,34 @@ class NRKBaseIE(InfoExtractor):          vcodec = 'none' if data.get('mediaType') == 'Audio' else None -        # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged -          for entry in entries:              entry.update(common_info)              for f in entry['formats']:                  f['vcodec'] = vcodec +        points = data.get('shortIndexPoints') +        if isinstance(points, list): +            chapters = [] +            for next_num, point in enumerate(points, start=1): +                if not isinstance(point, dict): +                    continue +                start_time = parse_duration(point.get('startPoint')) +                if start_time is None: +                    continue +                end_time = parse_duration( +                    data.get('duration') +                    if next_num == len(points) +                    else points[next_num].get('startPoint')) +                if end_time is None: +                    continue +                chapters.append({ +                    'start_time': start_time, +                    'end_time': end_time, +                    'title': point.get('title'), +                }) +            if chapters and len(entries) == 1: +                entries[0]['chapters'] = chapters +          return self.playlist_result(entries, video_id, title, description) @@ -216,7 +237,7 @@ class NRKTVIE(NRKBaseIE):                              (?:/\d{2}-\d{2}-\d{4})?                              (?:\#del=(?P<part_id>\d+))?                      
''' % _EPISODE_RE -    _API_HOST = 'psapi-we.nrk.no' +    _API_HOST = 'psapi-ne.nrk.no'      _TESTS = [{          'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py index 87fb94d1f..be1e09d37 100644 --- a/youtube_dl/extractor/nuevo.py +++ b/youtube_dl/extractor/nuevo.py @@ -10,9 +10,10 @@ from ..utils import (  class NuevoBaseIE(InfoExtractor): -    def _extract_nuevo(self, config_url, video_id): +    def _extract_nuevo(self, config_url, video_id, headers={}):          config = self._download_xml( -            config_url, video_id, transform_source=lambda s: s.strip()) +            config_url, video_id, transform_source=lambda s: s.strip(), +            headers=headers)          title = xpath_text(config, './title', 'title', fatal=True).strip()          video_id = xpath_text(config, './mediaid', default=video_id) diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index 94f57990b..58da1bc27 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -11,6 +11,7 @@ from ..utils import (      get_element_by_class,      int_or_none,      js_to_json, +    NO_DEFAULT,      parse_iso8601,      remove_start,      strip_or_none, @@ -199,6 +200,19 @@ class OnetPlIE(InfoExtractor):              'timestamp': 1487078046,          },      }, { +        # embedded via pulsembed +        'url': 'http://film.onet.pl/pensjonat-nad-rozlewiskiem-relacja-z-planu-serialu/y428n0', +        'info_dict': { +            'id': '501235.965429946', +            'ext': 'mp4', +            'title': '"Pensjonat nad rozlewiskiem": relacja z planu serialu', +            'upload_date': '20170622', +            'timestamp': 1498159955, +        }, +        'params': { +            'skip_download': True, +        }, +    }, {          'url': 'http://film.onet.pl/zwiastuny/ghost-in-the-shell-drugi-zwiastun-pl/5q6yl3',          'only_matching': True,      }, { @@ -212,13 +226,25 @@ class OnetPlIE(InfoExtractor):          'only_matching': True,      }] +    def _search_mvp_id(self, webpage, default=NO_DEFAULT): +        return self._search_regex( +            r'data-(?:params-)?mvp=["\'](\d+\.\d+)', webpage, 'mvp id', +            default=default) +      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        mvp_id = self._search_regex( -            r'data-params-mvp=["\'](\d+\.\d+)', webpage, 'mvp id') +        mvp_id = self._search_mvp_id(webpage, default=None) + +        if not mvp_id: +            pulsembed_url = self._search_regex( +                r'data-src=(["\'])(?P<url>(?:https?:)?//pulsembed\.eu/.+?)\1', +                webpage, 'pulsembed url', group='url') +            webpage = self._download_webpage( +                pulsembed_url, video_id, 'Downloading pulsembed webpage') +            mvp_id = self._search_mvp_id(webpage)          return self.url_result(              'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 84be2b1e3..52580baed 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -3,12 +3,14 @@ import re  import base64  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import ( -    int_or_none, -    float_or_none, +    determine_ext,      ExtractorError, +    float_or_none, +    int_or_none, +    try_get,      unsmuggle_url, -    
determine_ext,  )  from ..compat import compat_urllib_parse_urlencode @@ -39,13 +41,15 @@ class OoyalaBaseIE(InfoExtractor):          formats = []          if cur_auth_data['authorized']:              for stream in cur_auth_data['streams']: -                s_url = base64.b64decode( -                    stream['url']['data'].encode('ascii')).decode('utf-8') -                if s_url in urls: +                url_data = try_get(stream, lambda x: x['url']['data'], compat_str) +                if not url_data: +                    continue +                s_url = base64.b64decode(url_data.encode('ascii')).decode('utf-8') +                if not s_url or s_url in urls:                      continue                  urls.append(s_url)                  ext = determine_ext(s_url, None) -                delivery_type = stream['delivery_type'] +                delivery_type = stream.get('delivery_type')                  if delivery_type == 'hls' or ext == 'm3u8':                      formats.extend(self._extract_m3u8_formats(                          re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native', @@ -65,7 +69,7 @@ class OoyalaBaseIE(InfoExtractor):                  else:                      formats.append({                          'url': s_url, -                        'ext': ext or stream.get('delivery_type'), +                        'ext': ext or delivery_type,                          'vcodec': stream.get('video_codec'),                          'format_id': delivery_type,                          'width': int_or_none(stream.get('width')), @@ -136,6 +140,11 @@ class OoyalaIE(OoyalaBaseIE):                  'title': 'Divide Tool Path.mp4',                  'duration': 204.405,              } +        }, +        { +            # empty stream['url']['data'] +            'url': 'http://player.ooyala.com/player.js?embedCode=w2bnZtYjE6axZ_dw1Cd0hQtXd_ige2Is', +            'only_matching': True,          }      ] diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 1e2c54e68..74fe8017e 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -2,20 +2,19 @@  from __future__ import unicode_literals  import re -import calendar -import datetime  from .common import InfoExtractor  from ..compat import compat_str  from ..utils import ( +    determine_ext, +    float_or_none,      HEADRequest, -    unified_strdate, -    strip_jsonp,      int_or_none, -    float_or_none, -    determine_ext, +    orderedSet,      remove_end, +    strip_jsonp,      unescapeHTML, +    unified_strdate,  ) @@ -144,77 +143,25 @@ class ORFTVthekIE(InfoExtractor):          } -class ORFOE1IE(InfoExtractor): -    IE_NAME = 'orf:oe1' -    IE_DESC = 'Radio Österreich 1' -    _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)' - -    # Audios on ORF radio are only available for 7 days, so we can't add tests. 
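[Editor's note: the two station-specific classes removed in this hunk (orf:oe1 and
orf:fm4) are folded into the shared ORFRadioIE added further down. Below is a rough
standalone sketch of the request flow the new class performs; the
fetch_loopstream_urls name is invented here, and the 'streams' key on the broadcast
JSON is an assumption, since the hunk never shows how the item list is obtained:

    import json
    import urllib.request

    def fetch_loopstream_urls(station, show_id, show_date):
        # FM4 show ids get a '4' prefix before this call (see ORFRadioIE below)
        api_url = ('http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s'
                   % (station, show_id, show_date))
        data = json.loads(urllib.request.urlopen(api_url).read().decode('utf-8'))
        # each item carries a loopStreamId that maps onto an mp3 stream URL,
        # mirroring what extract_entry_dict below builds
        return ['http://loopstream01.apa.at/?channel=%s&id=%s'
                % (station, info['loopStreamId'])
                for info in data['streams']]

As the new tests note, shows disappear from this API after seven days, so the sketch
only works against broadcasts from the past week.]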
-    _TESTS = [{ -        'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211', -        'only_matching': True, -    }, { -        'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726', -        'only_matching': True, -    }] - -    def _real_extract(self, url): -        show_id = self._match_id(url) -        data = self._download_json( -            'http://oe1.orf.at/programm/%s/konsole' % show_id, -            show_id -        ) - -        timestamp = datetime.datetime.strptime('%s %s' % ( -            data['item']['day_label'], -            data['item']['time'] -        ), '%d.%m.%Y %H:%M') -        unix_timestamp = calendar.timegm(timestamp.utctimetuple()) - -        return { -            'id': show_id, -            'title': data['item']['title'], -            'url': data['item']['url_stream'], -            'ext': 'mp3', -            'description': data['item'].get('info'), -            'timestamp': unix_timestamp -        } - - -class ORFFM4IE(InfoExtractor): -    IE_NAME = 'orf:fm4' -    IE_DESC = 'radio FM4' -    _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' - -    _TEST = { -        'url': 'http://fm4.orf.at/player/20160110/IS/', -        'md5': '01e736e8f1cef7e13246e880a59ad298', -        'info_dict': { -            'id': '2016-01-10_2100_tl_54_7DaysSun13_11244', -            'ext': 'mp3', -            'title': 'Im Sumpf', -            'description': 'md5:384c543f866c4e422a55f66a62d669cd', -            'duration': 7173, -            'timestamp': 1452456073, -            'upload_date': '20160110', -        }, -        'skip': 'Live streams on FM4 got deleted soon', -    } - +class ORFRadioIE(InfoExtractor):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) +        station = mobj.group('station')          show_date = mobj.group('date')          show_id = mobj.group('show') +        if station == 'fm4': +            show_id = '4%s' % show_id +          data = self._download_json( -            'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id), +            'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date),              show_id          )          def extract_entry_dict(info, title, subtitle):              return {                  'id': info['loopStreamId'].replace('.mp3', ''), -                'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'], +                'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, info['loopStreamId']),                  'title': title,                  'description': subtitle,                  'duration': (info['end'] - info['start']) / 1000, @@ -233,6 +180,47 @@ class ORFFM4IE(InfoExtractor):          } +class ORFFM4IE(ORFRadioIE): +    IE_NAME = 'orf:fm4' +    IE_DESC = 'radio FM4' +    _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + +    _TEST = { +        'url': 'http://fm4.orf.at/player/20170107/CC', +        'md5': '2b0be47375432a7ef104453432a19212', +        'info_dict': { +            'id': '2017-01-07_2100_tl_54_7DaysSat18_31295', +            'ext': 'mp3', +            'title': 'Solid Steel Radioshow', +            'description': 'Die Mixshow von Coldcut und Ninja Tune.', +            'duration': 3599, +            'timestamp': 1483819257, +            'upload_date': '20170107', +        }, +        'skip': 'Shows from ORF radios are only available 
for 7 days.' +    } + + +class ORFOE1IE(ORFRadioIE): +    IE_NAME = 'orf:oe1' +    IE_DESC = 'Radio Österreich 1' +    _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + +    _TEST = { +        'url': 'http://oe1.orf.at/player/20170108/456544', +        'md5': '34d8a6e67ea888293741c86a099b745b', +        'info_dict': { +            'id': '2017-01-08_0759_tl_51_7DaysSun6_256141', +            'ext': 'mp3', +            'title': 'Morgenjournal', +            'duration': 609, +            'timestamp': 1483858796, +            'upload_date': '20170108', +        }, +        'skip': 'Shows from ORF radios are only available for 7 days.' +    } + +  class ORFIPTVIE(InfoExtractor):      IE_NAME = 'orf:iptv'      IE_DESC = 'iptv.ORF.at' @@ -320,3 +308,108 @@ class ORFIPTVIE(InfoExtractor):              'upload_date': upload_date,              'formats': formats,          } + + +class ORFFM4StoryIE(InfoExtractor): +    IE_NAME = 'orf:fm4:story' +    IE_DESC = 'fm4.orf.at stories' +    _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)' + +    _TEST = { +        'url': 'http://fm4.orf.at/stories/2865738/', +        'playlist': [{ +            'md5': 'e1c2c706c45c7b34cf478bbf409907ca', +            'info_dict': { +                'id': '547792', +                'ext': 'flv', +                'title': 'Manu Delago und Inner Tongue live', +                'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', +                'duration': 1748.52, +                'thumbnail': r're:^https?://.*\.jpg$', +                'upload_date': '20170913', +            }, +        }, { +            'md5': 'c6dd2179731f86f4f55a7b49899d515f', +            'info_dict': { +                'id': '547798', +                'ext': 'flv', +                'title': 'Manu Delago und Inner Tongue live (2)', +                'duration': 1504.08, +                'thumbnail': r're:^https?://.*\.jpg$', +                'upload_date': '20170913', +                'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. 
Hier gibt es Fotos und die gesamte Session als Video.', +            }, +        }], +    } + +    def _real_extract(self, url): +        story_id = self._match_id(url) +        webpage = self._download_webpage(url, story_id) + +        entries = [] +        all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) +        for idx, video_id in enumerate(all_ids): +            data = self._download_json( +                'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, +                video_id)[0] + +            duration = float_or_none(data['duration'], 1000) + +            video = data['sources']['q8c'] +            load_balancer_url = video['loadBalancerUrl'] +            abr = int_or_none(video.get('audioBitrate')) +            vbr = int_or_none(video.get('bitrate')) +            fps = int_or_none(video.get('videoFps')) +            width = int_or_none(video.get('videoWidth')) +            height = int_or_none(video.get('videoHeight')) +            thumbnail = video.get('preview') + +            rendition = self._download_json( +                load_balancer_url, video_id, transform_source=strip_jsonp) + +            f = { +                'abr': abr, +                'vbr': vbr, +                'fps': fps, +                'width': width, +                'height': height, +            } + +            formats = [] +            for format_id, format_url in rendition['redirect'].items(): +                if format_id == 'rtmp': +                    ff = f.copy() +                    ff.update({ +                        'url': format_url, +                        'format_id': format_id, +                    }) +                    formats.append(ff) +                elif determine_ext(format_url) == 'f4m': +                    formats.extend(self._extract_f4m_formats( +                        format_url, video_id, f4m_id=format_id)) +                elif determine_ext(format_url) == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        format_url, video_id, 'mp4', m3u8_id=format_id)) +                else: +                    continue +            self._sort_formats(formats) + +            title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') +            if idx >= 1: +                # Titles are duplicates, make them unique +                title += ' (' + str(idx + 1) + ')' +            description = self._og_search_description(webpage) +            upload_date = unified_strdate(self._html_search_meta( +                'dc.date', webpage, 'upload date')) + +            entries.append({ +                'id': video_id, +                'title': title, +                'description': description, +                'duration': duration, +                'thumbnail': thumbnail, +                'upload_date': upload_date, +                'formats': formats, +            }) + +        return self.playlist_result(entries) diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 881f3bcc7..8ed3c6347 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -1,9 +1,13 @@  from __future__ import unicode_literals +import json  import re  from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( +    compat_str, +    compat_HTTPError, +)  from ..utils import (      clean_html,      ExtractorError, @@ -34,6 +38,25 @@ class PacktPubIE(PacktPubBaseIE):              'upload_date': '20170331',      
    },      } +    _NETRC_MACHINE = 'packtpub' +    _TOKEN = None + +    def _real_initialize(self): +        (username, password) = self._get_login_info() +        if username is None: +            return +        try: +            self._TOKEN = self._download_json( +                self._MAPT_REST + '/users/tokens', None, +                'Downloading Authorization Token', data=json.dumps({ +                    'email': username, +                    'password': password, +                }).encode())['data']['access'] +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 404): +                message = self._parse_json(e.cause.read().decode(), None)['message'] +                raise ExtractorError(message, expected=True) +            raise      def _handle_error(self, response):          if response.get('status') != 'success': @@ -51,14 +74,17 @@ class PacktPubIE(PacktPubBaseIE):          course_id, chapter_id, video_id = mobj.group(              'course_id', 'chapter_id', 'id') +        headers = {} +        if self._TOKEN: +            headers['Authorization'] = 'Bearer ' + self._TOKEN          video = self._download_json(              '%s/users/me/products/%s/chapters/%s/sections/%s'              % (self._MAPT_REST, course_id, chapter_id, video_id), video_id, -            'Downloading JSON video')['data'] +            'Downloading JSON video', headers=headers)['data']          content = video.get('content')          if not content: -            raise ExtractorError('This video is locked', expected=True) +            self.raise_login_required('This video is locked')          video_url = content['file'] diff --git a/youtube_dl/extractor/pandatv.py b/youtube_dl/extractor/pandatv.py index 133cc9b88..c86d70771 100644 --- a/youtube_dl/extractor/pandatv.py +++ b/youtube_dl/extractor/pandatv.py @@ -10,13 +10,13 @@ from ..utils import (  class PandaTVIE(InfoExtractor):      IE_DESC = '熊猫TV' -    _VALID_URL = r'http://(?:www\.)?panda\.tv/(?P<id>[0-9]+)' -    _TEST = { -        'url': 'http://www.panda.tv/10091', +    _VALID_URL = r'https?://(?:www\.)?panda\.tv/(?P<id>[0-9]+)' +    _TESTS = [{ +        'url': 'http://www.panda.tv/66666',          'info_dict': { -            'id': '10091', +            'id': '66666',              'title': 're:.+', -            'uploader': '囚徒', +            'uploader': '刘杀鸡',              'ext': 'flv',              'is_live': True,          }, @@ -24,13 +24,16 @@ class PandaTVIE(InfoExtractor):              'skip_download': True,          },          'skip': 'Live stream is offline', -    } +    }, { +        'url': 'https://www.panda.tv/66666', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url)          config = self._download_json( -            'http://www.panda.tv/api_room?roomid=%s' % video_id, video_id) +            'https://www.panda.tv/api_room?roomid=%s' % video_id, video_id)          error_code = config.get('errno', 0)          if error_code is not 0: @@ -74,7 +77,7 @@ class PandaTVIE(InfoExtractor):                  continue              for pref, (ext, pl) in enumerate((('m3u8', '-hls'), ('flv', ''))):                  formats.append({ -                    'url': 'http://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s' +                    'url': 'https://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s'                      % (pl, plflag1, room_key, live_panda, suffix[quality], ext),                      'format_id': '%s-%s' % (k, 
ext),                      'quality': quality, diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py index 89c95fffb..fc7bd3411 100644 --- a/youtube_dl/extractor/pandoratv.py +++ b/youtube_dl/extractor/pandoratv.py @@ -19,7 +19,7 @@ class PandoraTVIE(InfoExtractor):      IE_NAME = 'pandora.tv'      IE_DESC = '판도라TV'      _VALID_URL = r'https?://(?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?' -    _TEST = { +    _TESTS = [{          'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2',          'info_dict': {              'id': '53294230', @@ -34,7 +34,26 @@ class PandoraTVIE(InfoExtractor):              'view_count': int,              'like_count': int,          } -    } +    }, { +        'url': 'http://channel.pandora.tv/channel/video.ptv?ch_userid=gogoucc&prgid=54721744', +        'info_dict': { +            'id': '54721744', +            'ext': 'flv', +            'title': '[HD] JAPAN COUNTDOWN 170423', +            'description': '[HD] JAPAN COUNTDOWN 170423', +            'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 1704.9, +            'upload_date': '20170423', +            'uploader': 'GOGO_UCC', +            'uploader_id': 'gogoucc', +            'view_count': int, +            'like_count': int, +        }, +        'params': { +            # Test metadata only +            'skip_download': True, +        }, +    }]      def _real_extract(self, url):          qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) @@ -86,7 +105,7 @@ class PandoraTVIE(InfoExtractor):              'description': info.get('body'),              'thumbnail': info.get('thumbnail') or info.get('poster'),              'duration': float_or_none(info.get('runtime'), 1000) or parse_duration(info.get('time')), -            'upload_date': info['fid'][:8] if isinstance(info.get('fid'), compat_str) else None, +            'upload_date': info['fid'].split('/')[-1][:8] if isinstance(info.get('fid'), compat_str) else None,              'uploader': info.get('nickname'),              'uploader_id': info.get('upload_userid'),              'view_count': str_to_int(info.get('hit')), diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 0727e381b..8889e4a1a 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -10,6 +10,7 @@ from ..utils import (      int_or_none,      float_or_none,      js_to_json, +    orderedSet,      strip_jsonp,      strip_or_none,      unified_strdate, @@ -188,7 +189,7 @@ class PBSIE(InfoExtractor):             # Direct video URL             (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? 
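           # [Editor's note, not part of the patch: the article alternative below
           #  now accepts a single leading path segment ({1,5} instead of {2,5}),
           #  which is what lets URLs like pbs.org/video/<slug>/ match; see the
           #  new test referencing issue 13801 in this hunk.]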
|             # Article with embedded player (or direct video) -           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | +           (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |             # Player             (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/          ) @@ -265,6 +266,13 @@ class PBSIE(InfoExtractor):              'playlist_count': 2,          },          { +            'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/', +            'info_dict': { +                'id': 'great-war', +            }, +            'playlist_count': 3, +        }, +        {              'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',              'info_dict': {                  'id': '2276541483', @@ -338,6 +346,21 @@ class PBSIE(InfoExtractor):              },          },          { +            # https://github.com/rg3/youtube-dl/issues/13801 +            'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/', +            'info_dict': { +                'id': '3003333873', +                'ext': 'mp4', +                'title': 'PBS NewsHour - full episode July 31, 2017', +                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', +                'duration': 3265, +                'thumbnail': r're:^https?://.*\.jpg$', +            }, +            'params': { +                'skip_download': True, +            }, +        }, +        {              'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',              'only_matching': True,          }, @@ -382,10 +405,10 @@ class PBSIE(InfoExtractor):              # tabbed frontline videos              MULTI_PART_REGEXES = (                  r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', -                r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)', +                r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)',              )              for p in MULTI_PART_REGEXES: -                tabbed_videos = re.findall(p, webpage) +                tabbed_videos = orderedSet(re.findall(p, webpage))                  if tabbed_videos:                      return tabbed_videos, presumptive_id, upload_date, description @@ -425,6 +448,9 @@ class PBSIE(InfoExtractor):                  if url:                      break +            if not url: +                url = self._og_search_url(webpage) +              mobj = re.match(self._VALID_URL, url)          player_id = mobj.group('player_id') diff --git a/youtube_dl/extractor/pearvideo.py b/youtube_dl/extractor/pearvideo.py new file mode 100644 index 000000000..1d777221c --- /dev/null +++ b/youtube_dl/extractor/pearvideo.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    qualities, +    unified_timestamp, +) + + +class PearVideoIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>\d+)' +    _TEST = { +        'url': 'http://www.pearvideo.com/video_1076290', +        'info_dict': { +            'id': '1076290', +            'ext': 'mp4', +            'title': '小浣熊在主人家玻璃上滚石头:没砸', +            'description': 'md5:01d576b747de71be0ee85eb7cac25f9d', +            'timestamp': 1494275280, +            'upload_date': '20170508', +        } +    } + +    def 
_real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        quality = qualities( +            ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src')) + +        formats = [{ +            'url': mobj.group('url'), +            'format_id': mobj.group('id'), +            'quality': quality(mobj.group('id')), +        } for mobj in re.finditer( +            r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2', +            webpage)] +        self._sort_formats(formats) + +        title = self._search_regex( +            (r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)', +             r'<[^>]+\bdata-title=(["\'])(?P<value>(?:(?!\1).)+)\1'), +            webpage, 'title', group='value') +        description = self._search_regex( +            (r'<div[^>]+\bclass=(["\'])summary\1[^>]*>(?P<value>[^<]+)', +             r'<[^>]+\bdata-summary=(["\'])(?P<value>(?:(?!\1).)+)\1'), +            webpage, 'description', default=None, +            group='value') or self._html_search_meta('Description', webpage) +        timestamp = unified_timestamp(self._search_regex( +            r'<div[^>]+\bclass=["\']date["\'][^>]*>([^<]+)', +            webpage, 'timestamp', fatal=False)) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'timestamp': timestamp, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 1add6b840..e5e08538c 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -49,7 +49,7 @@ class PeriscopeIE(PeriscopeBaseIE):      @staticmethod      def _extract_url(webpage):          mobj = re.search( -            r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?periscope\.tv/(?:(?!\1).)+)\1', webpage) +            r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage)          if mobj:              return mobj.group('url') @@ -80,18 +80,24 @@ class PeriscopeIE(PeriscopeBaseIE):          stream = self._call_api(              'getAccessPublic', {'broadcast_id': token}, token) +        video_urls = set()          formats = [] -        for format_id in ('replay', 'rtmp', 'hls', 'https_hls'): +        for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'):              video_url = stream.get(format_id + '_url') -            if not video_url: +            if not video_url or video_url in video_urls:                  continue -            f = { +            video_urls.add(video_url) +            if format_id != 'rtmp': +                formats.extend(self._extract_m3u8_formats( +                    video_url, token, 'mp4', +                    entry_protocol='m3u8_native' +                    if state in ('ended', 'timed_out') else 'm3u8', +                    m3u8_id=format_id, fatal=False)) +                continue +            formats.append({                  'url': video_url,                  'ext': 'flv' if format_id == 'rtmp' else 'mp4', -            } -            if format_id != 'rtmp': -                f['protocol'] = 'm3u8_native' if state in ('ended', 'timed_out') else 'm3u8' -            formats.append(f) +            })          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index e45d9fe55..f6a9131b1 100644 --- 
a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -18,6 +18,7 @@ from ..utils import (      parse_duration,      qualities,      srt_subtitles_timecode, +    try_get,      update_url_query,      urlencode_postdata,  ) @@ -26,6 +27,39 @@ from ..utils import (  class PluralsightBaseIE(InfoExtractor):      _API_BASE = 'https://app.pluralsight.com' +    def _download_course(self, course_id, url, display_id): +        try: +            return self._download_course_rpc(course_id, url, display_id) +        except ExtractorError: +            # Old API fallback +            return self._download_json( +                'https://app.pluralsight.com/player/user/api/v1/player/payload', +                display_id, data=urlencode_postdata({'courseId': course_id}), +                headers={'Referer': url}) + +    def _download_course_rpc(self, course_id, url, display_id): +        response = self._download_json( +            '%s/player/functions/rpc' % self._API_BASE, display_id, +            'Downloading course JSON', +            data=json.dumps({ +                'fn': 'bootstrapPlayer', +                'payload': { +                    'courseId': course_id, +                }, +            }).encode('utf-8'), +            headers={ +                'Content-Type': 'application/json;charset=utf-8', +                'Referer': url, +            }) + +        course = try_get(response, lambda x: x['payload']['course'], dict) +        if course: +            return course + +        raise ExtractorError( +            '%s said: %s' % (self.IE_NAME, response['error']['message']), +            expected=True) +  class PluralsightIE(PluralsightBaseIE):      IE_NAME = 'pluralsight' @@ -162,10 +196,7 @@ class PluralsightIE(PluralsightBaseIE):          display_id = '%s-%s' % (name, clip_id) -        course = self._download_json( -            'https://app.pluralsight.com/player/user/api/v1/player/payload', -            display_id, data=urlencode_postdata({'courseId': course_name}), -            headers={'Referer': url}) +        course = self._download_course(course_name, url, display_id)          collection = course['modules'] @@ -224,6 +255,7 @@ class PluralsightIE(PluralsightBaseIE):                  req_format_split = req_format.split('-', 1)                  if len(req_format_split) > 1:                      req_ext, req_quality = req_format_split +                    req_quality = '-'.join(req_quality.split('-')[:2])                      for allowed_quality in ALLOWED_QUALITIES:                          if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities:                              return (AllowedQuality(req_ext, (req_quality, )), ) @@ -330,18 +362,7 @@ class PluralsightCourseIE(PluralsightBaseIE):          # TODO: PSM cookie -        course = self._download_json( -            '%s/player/functions/rpc' % self._API_BASE, course_id, -            'Downloading course JSON', -            data=json.dumps({ -                'fn': 'bootstrapPlayer', -                'payload': { -                    'courseId': course_id, -                } -            }).encode('utf-8'), -            headers={ -                'Content-Type': 'application/json;charset=utf-8' -            })['payload']['course'] +        course = self._download_course(course_id, url, course_id)          title = course['title']          course_name = course['name'] diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py index f20946a2b..25fcebf9f 100644 --- 
a/youtube_dl/extractor/podomatic.py +++ b/youtube_dl/extractor/podomatic.py @@ -9,39 +9,46 @@ from ..utils import int_or_none  class PodomaticIE(InfoExtractor):      IE_NAME = 'podomatic' -    _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)' +    _VALID_URL = r'''(?x) +                    (?P<proto>https?):// +                        (?: +                            (?P<channel>[^.]+)\.podomatic\.com/entry| +                            (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes +                        )/ +                        (?P<id>[^/?#&]+) +                ''' -    _TESTS = [ -        { -            'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', -            'md5': '84bb855fcf3429e6bf72460e1eed782d', -            'info_dict': { -                'id': '2009-01-02T16_03_35-08_00', -                'ext': 'mp3', -                'uploader': 'Science Teaching Tips', -                'uploader_id': 'scienceteachingtips', -                'title': '64.  When the Moon Hits Your Eye', -                'duration': 446, -            } -        }, -        { -            'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', -            'md5': 'd2cf443931b6148e27638650e2638297', -            'info_dict': { -                'id': '2013-11-15T16_31_21-08_00', -                'ext': 'mp3', -                'uploader': 'Ostbahnhof / Techno Mix', -                'uploader_id': 'ostbahnhof', -                'title': 'Einunddreizig', -                'duration': 3799, -            } -        }, -    ] +    _TESTS = [{ +        'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', +        'md5': '84bb855fcf3429e6bf72460e1eed782d', +        'info_dict': { +            'id': '2009-01-02T16_03_35-08_00', +            'ext': 'mp3', +            'uploader': 'Science Teaching Tips', +            'uploader_id': 'scienceteachingtips', +            'title': '64.  
When the Moon Hits Your Eye', +            'duration': 446, +        } +    }, { +        'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', +        'md5': 'd2cf443931b6148e27638650e2638297', +        'info_dict': { +            'id': '2013-11-15T16_31_21-08_00', +            'ext': 'mp3', +            'uploader': 'Ostbahnhof / Techno Mix', +            'uploader_id': 'ostbahnhof', +            'title': 'Einunddreizig', +            'duration': 3799, +        } +    }, { +        'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00', +        'only_matching': True, +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') -        channel = mobj.group('channel') +        channel = mobj.group('channel') or mobj.group('channel_2')          json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' +                       '?permalink=true&rtmp=0') % diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py index 2ac1fcb0b..978d6f813 100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -65,7 +65,7 @@ class PolskieRadioIE(InfoExtractor):          webpage = self._download_webpage(url, playlist_id)          content = self._search_regex( -            r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>', +            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',              webpage, 'content')          timestamp = unified_timestamp(self._html_search_regex( diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 842317e6c..b52879c7a 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor):               r'<title>(.+?) 
- .*?[Pp]ornHD.*?</title>'], webpage, 'title')          sources = self._parse_json(js_to_json(self._search_regex( -            r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]", +            r"(?s)sources'?\s*[:=]\s*(\{.+?\})",              webpage, 'sources', default='{}')), video_id)          if not sources: @@ -82,7 +82,8 @@ class PornHdIE(InfoExtractor):          view_count = int_or_none(self._html_search_regex(              r'(\d+) views\s*<', webpage, 'view count', fatal=False))          thumbnail = self._search_regex( -            r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) +            r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage, +            'thumbnail', fatal=False, group='url')          return {              'id': video_id, diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b25f1f193..3428458af 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor):      _VALID_URL = r'''(?x)                      https?://                          (?: -                            (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)| +                            (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|                              (?:www\.)?thumbzilla\.com/video/                          )                          (?P<id>[\da-z]+) @@ -97,6 +97,9 @@ class PornHubIE(InfoExtractor):      }, {          'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',          'only_matching': True, +    }, { +        'url': 'http://www.pornhub.com/video/show?viewkey=648719015', +        'only_matching': True,      }]      @staticmethod @@ -183,7 +186,7 @@ class PornHubIE(InfoExtractor):              title, thumbnail, duration = [None] * 3          video_uploader = self._html_search_regex( -            r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<', +            r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',              webpage, 'uploader', fatal=False)          view_count = self._extract_count( @@ -224,13 +227,20 @@ class PornHubIE(InfoExtractor):  class PornHubPlaylistBaseIE(InfoExtractor):      def _extract_entries(self, webpage): +        # Only process container div with main playlist content skipping +        # drop-down menu that uses similar pattern for videos (see +        # https://github.com/rg3/youtube-dl/issues/11594). +        container = self._search_regex( +            r'(?s)(<div[^>]+class=["\']container.+)', webpage, +            'container', default=webpage) +          return [              self.url_result(                  'http://www.pornhub.com/%s' % video_url,                  PornHubIE.ie_key(), video_title=title)              for video_url, title in orderedSet(re.findall(                  r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', -                webpage)) +                container))          ]      def _real_extract(self, url): @@ -238,22 +248,18 @@ class PornHubPlaylistBaseIE(InfoExtractor):          webpage = self._download_webpage(url, playlist_id) -        # Only process container div with main playlist content skipping -        # drop-down menu that uses similar pattern for videos (see -        # https://github.com/rg3/youtube-dl/issues/11594). 
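The _extract_entries hunk above (and the matching removal from _real_extract just below) centralizes the container scoping, so the drop-down menu, which uses the same href pattern, never leaks into playlist results no matter which subclass calls the helper. A standalone sketch of the technique with plain re, outside the InfoExtractor machinery (function name and return shape are illustrative only):

import re

def extract_entries(webpage):
    # Narrow matching to the main content container when present; fall back
    # to the whole page, mirroring _search_regex(..., default=webpage).
    mobj = re.search(r'(?s)(<div[^>]+class=["\']container.+)', webpage)
    container = mobj.group(1) if mobj else webpage
    # De-duplicate while preserving first-seen order, like utils.orderedSet.
    seen, entries = set(), []
    for video_url, title in re.findall(
            r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
            container):
        if video_url not in seen:
            seen.add(video_url)
            entries.append((video_url, title))
    return entries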
-        container = self._search_regex( -            r'(?s)(<div[^>]+class=["\']container.+)', webpage, -            'container', default=webpage) - -        entries = self._extract_entries(container) +        entries = self._extract_entries(webpage)          playlist = self._parse_json(              self._search_regex( -                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'), -            playlist_id) +                r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, +                'playlist', default='{}'), +            playlist_id, fatal=False) +        title = playlist.get('title') or self._search_regex( +            r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)          return self.playlist_result( -            entries, playlist_id, playlist.get('title'), playlist.get('description')) +            entries, playlist_id, title, playlist.get('description'))  class PornHubPlaylistIE(PornHubPlaylistBaseIE): @@ -293,6 +299,7 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE):              except ExtractorError as e:                  if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:                      break +                raise              page_entries = self._extract_entries(webpage)              if not page_entries:                  break diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 17c27da46..084308aeb 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -2,38 +2,37 @@  from __future__ import unicode_literals  import random -import time  import re +import time  from .common import InfoExtractor  from ..utils import ( -    sanitized_Request, -    strip_jsonp, -    unescapeHTML,      clean_html,      ExtractorError, +    strip_jsonp, +    unescapeHTML,  )  class QQMusicIE(InfoExtractor):      IE_NAME = 'qqmusic'      IE_DESC = 'QQ音乐' -    _VALID_URL = r'https?://y\.qq\.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' +    _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html'      _TESTS = [{ -        'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', -        'md5': '9ce1c1c8445f561506d2e3cfb0255705', +        'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', +        'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8',          'info_dict': {              'id': '004295Et37taLD',              'ext': 'mp3',              'title': '可惜没如果',              'release_date': '20141227',              'creator': '林俊杰', -            'description': 'md5:d327722d0361576fde558f1ac68a7065', +            'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac',              'thumbnail': r're:^https?://.*\.jpg$',          }      }, {          'note': 'There is no mp3-320 version of this song.', -        'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV', +        'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html',          'md5': 'fa3926f0c585cda0af8fa4f796482e3e',          'info_dict': {              'id': '004MsGEo3DdNxV', @@ -46,14 +45,14 @@ class QQMusicIE(InfoExtractor):          }      }, {          'note': 'lyrics not in .lrc format', -        'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6', +        'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html',          'info_dict': {              'id': '001JyApY11tIp6',              'ext': 'mp3',              'title': 'Shadows Over Transylvania',              'release_date': '19970225',              'creator': 'Dark Funeral', -            'description': 
'md5:ed14d5bd7ecec19609108052c25b2c11', +            'description': 'md5:c9b20210587cbcd6836a1c597bab4525',              'thumbnail': r're:^https?://.*\.jpg$',          },          'params': { @@ -105,7 +104,7 @@ class QQMusicIE(InfoExtractor):              [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],              detail_info_page, 'album mid', default=None)          if albummid: -            thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \ +            thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \                              % (albummid[-2:-1], albummid[-1], albummid)          guid = self.m_r_get_ruin() @@ -156,15 +155,39 @@ class QQPlaylistBaseIE(InfoExtractor):      def qq_static_url(category, mid):          return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid) -    @classmethod -    def get_entries_from_page(cls, page): +    def get_singer_all_songs(self, singmid, num): +        return self._download_webpage( +            r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid, +            query={ +                'format': 'json', +                'inCharset': 'utf8', +                'outCharset': 'utf-8', +                'platform': 'yqq', +                'needNewCode': 0, +                'singermid': singmid, +                'order': 'listen', +                'begin': 0, +                'num': num, +                'songstatus': 1, +            }) + +    def get_entries_from_page(self, singmid):          entries = [] -        for item in re.findall(r'class="data"[^<>]*>([^<>]+)</', page): -            song_mid = unescapeHTML(item).split('|')[-5] -            entries.append(cls.url_result( -                'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', -                song_mid)) +        default_num = 1 +        json_text = self.get_singer_all_songs(singmid, default_num) +        json_obj_all_songs = self._parse_json(json_text, singmid) + +        if json_obj_all_songs['code'] == 0: +            total = json_obj_all_songs['data']['total'] +            json_text = self.get_singer_all_songs(singmid, total) +            json_obj_all_songs = self._parse_json(json_text, singmid) + +        for item in json_obj_all_songs['data']['list']: +            if item['musicData'].get('songmid') is not None: +                songmid = item['musicData']['songmid'] +                entries.append(self.url_result( +                    r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid))          return entries @@ -172,42 +195,32 @@ class QQPlaylistBaseIE(InfoExtractor):  class QQMusicSingerIE(QQPlaylistBaseIE):      IE_NAME = 'qqmusic:singer'      IE_DESC = 'QQ音乐 - 歌手' -    _VALID_URL = r'https?://y\.qq\.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' +    _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html'      _TEST = { -        'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', +        'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html',          'info_dict': {              'id': '001BLpXF2DyJe2',              'title': '林俊杰',              'description': 'md5:870ec08f7d8547c29c93010899103751',          }, -        'playlist_count': 12, +        'playlist_mincount': 12,      }      def _real_extract(self, url):          mid = self._match_id(url) -        singer_page = self._download_webpage( -            self.qq_static_url('singer', mid), mid, 'Download singer page') - -        entries = 
self.get_entries_from_page(singer_page) - +        entries = self.get_entries_from_page(mid) +        singer_page = self._download_webpage(url, mid, 'Download singer page')          singer_name = self._html_search_regex( -            r"singername\s*:\s*'([^']+)'", singer_page, 'singer name', -            default=None) - -        singer_id = self._html_search_regex( -            r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id', -            default=None) - +            r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None)          singer_desc = None -        if singer_id: -            req = sanitized_Request( -                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id) -            req.add_header( -                'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html') +        if mid:              singer_desc_page = self._download_xml( -                req, mid, 'Donwload singer description XML') +                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid, +                'Download singer description XML', +                query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid}, +                headers={'Referer': 'https://y.qq.com/n/yqq/singer/'})              singer_desc = singer_desc_page.find('./data/info/desc').text @@ -217,10 +230,10 @@  class QQMusicAlbumIE(QQPlaylistBaseIE):      IE_NAME = 'qqmusic:album'      IE_DESC = 'QQ音乐 - 专辑' -    _VALID_URL = r'https?://y\.qq\.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' +    _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html'      _TESTS = [{ -        'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', +        'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html',          'info_dict': {              'id': '000gXCTb2AhRR1',              'title': '我们都是这样长大的', @@ -228,7 +241,7 @@          },          'playlist_count': 4,      }, { -        'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3', +        'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html',          'info_dict': {              'id': '002Y5a3b3AlCu3',              'title': '그리고...', @@ -246,7 +259,7 @@          entries = [              self.url_result( -                'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] +                'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']              ) for song in album['list']          ]          album_name = album.get('name') @@ -260,31 +273,30 @@  class QQMusicToplistIE(QQPlaylistBaseIE):      IE_NAME = 'qqmusic:toplist'      IE_DESC = 'QQ音乐 - 排行榜' -    _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' +    _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html'      _TESTS = [{ -        'url': 'http://y.qq.com/#type=toplist&p=global_123', +        'url': 'https://y.qq.com/n/yqq/toplist/123.html',          'info_dict': { -            'id': 'global_123', +            'id': '123',              'title': '美国iTunes榜', +            'description': 'md5:89db2335fdbb10678dee2d43fe9aba08',          }, -        'playlist_count': 10, +        'playlist_count': 100,      }, { -        'url': 'http://y.qq.com/#type=toplist&p=top_3', +        'url': 
'https://y.qq.com/n/yqq/toplist/3.html',          'info_dict': { -            'id': 'top_3', +            'id': '3',              'title': '巅峰榜·欧美', -            'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统' -                           '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据' -                           '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:' -                           '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放' +            'description': 'md5:5a600d42c01696b26b71f8c4d43407da',          },          'playlist_count': 100,      }, { -        'url': 'http://y.qq.com/#type=toplist&p=global_106', +        'url': 'https://y.qq.com/n/yqq/toplist/106.html',          'info_dict': { -            'id': 'global_106', +            'id': '106',              'title': '韩国Mnet榜', +            'description': 'md5:cb84b325215e1d21708c615cac82a6e7',          },          'playlist_count': 50,      }] @@ -292,18 +304,15 @@ class QQMusicToplistIE(QQPlaylistBaseIE):      def _real_extract(self, url):          list_id = self._match_id(url) -        list_type, num_id = list_id.split("_") -          toplist_json = self._download_json( -            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json' -            % (list_type, num_id), -            list_id, 'Download toplist page') +            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, +            note='Download toplist page', +            query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) -        entries = [ -            self.url_result( -                'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid'] -            ) for song in toplist_json['songlist'] -        ] +        entries = [self.url_result( +            'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', +            song['data']['songmid']) +            for song in toplist_json['songlist']]          topinfo = toplist_json.get('topinfo', {})          list_name = topinfo.get('ListName') @@ -314,10 +323,10 @@ class QQMusicToplistIE(QQPlaylistBaseIE):  class QQMusicPlaylistIE(QQPlaylistBaseIE):      IE_NAME = 'qqmusic:playlist'      IE_DESC = 'QQ音乐 - 歌单' -    _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' +    _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html'      _TESTS = [{ -        'url': 'http://y.qq.com/#type=taoge&id=3462654915', +        'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html',          'info_dict': {              'id': '3462654915',              'title': '韩国5月新歌精选下旬', @@ -326,7 +335,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):          'playlist_count': 40,          'skip': 'playlist gone',      }, { -        'url': 'http://y.qq.com/#type=taoge&id=1374105607', +        'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html',          'info_dict': {              'id': '1374105607',              'title': '易入人心的华语民谣', @@ -339,8 +348,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):          list_id = self._match_id(url)          list_json = self._download_json( -            'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s' -            % list_id, list_id, 'Download list page', +            'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', +            list_id, 'Download list page', +            query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id},       
       transform_source=strip_jsonp)          if not len(list_json.get('cdlist', [])):              if list_json.get('code'): @@ -350,11 +360,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):              raise ExtractorError('Unable to get playlist info')          cdlist = list_json['cdlist'][0] -        entries = [ -            self.url_result( -                'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] -            ) for song in cdlist['songlist'] -        ] +        entries = [self.url_result( +            'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) +            for song in cdlist['songlist']]          list_name = cdlist.get('dissname')          list_description = clean_html(unescapeHTML(cdlist.get('desc'))) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 3b40002a8..b952e59b4 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -20,20 +20,37 @@ from ..utils import (  class RadioCanadaIE(InfoExtractor):      IE_NAME = 'radiocanada'      _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' -    _TEST = { -        'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', -        'info_dict': { -            'id': '7184272', -            'ext': 'mp4', -            'title': 'Le parcours du tireur capté sur vidéo', -            'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', -            'upload_date': '20141023', -        }, -        'params': { -            # m3u8 download -            'skip_download': True, +    _TESTS = [ +        { +            'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', +            'info_dict': { +                'id': '7184272', +                'ext': 'mp4', +                'title': 'Le parcours du tireur capté sur vidéo', +                'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', +                'upload_date': '20141023', +            }, +            'params': { +                # m3u8 download +                'skip_download': True, +            }          }, -    } +        { +            # empty Title +            'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', +            'info_dict': { +                'id': '7754998', +                'ext': 'mp4', +                'title': 'letelejournal22h', +                'description': 'INTEGRALE WEB 22H-TJ', +                'upload_date': '20170720', +            }, +            'params': { +                # m3u8 download +                'skip_download': True, +            }, +        } +    ]      def _real_extract(self, url):          url, smuggled_data = unsmuggle_url(url, {}) @@ -59,6 +76,7 @@ class RadioCanadaIE(InfoExtractor):              device_types.append('android')          formats = [] +        error = None          # TODO: extract f4m formats          # f4m formats can be extracted using flashhd device_type but they produce unplayable file          for device_type in device_types: @@ -84,8 +102,8 @@ class RadioCanadaIE(InfoExtractor):              if not v_url:                  continue              if v_url == 'null': -                raise ExtractorError('%s said: %s' % ( -                    self.IE_NAME, xpath_text(v_data, 'message')), expected=True) +    
            error = xpath_text(v_data, 'message') +                continue              ext = determine_ext(v_url)              if ext == 'm3u8':                  formats.extend(self._extract_m3u8_formats( @@ -129,6 +147,9 @@ class RadioCanadaIE(InfoExtractor):                              formats.extend(self._extract_f4m_formats(                                  base_url + '/manifest.f4m', video_id,                                  f4m_id='hds', fatal=False)) +        if not formats and error: +            raise ExtractorError( +                '%s said: %s' % (self.IE_NAME, error), expected=True)          self._sort_formats(formats)          subtitles = {} @@ -141,7 +162,7 @@ class RadioCanadaIE(InfoExtractor):          return {              'id': video_id, -            'title': get_meta('Title'), +            'title': get_meta('Title') or get_meta('AV-nomEmission'),              'description': get_meta('Description') or get_meta('ShortDescription'),              'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),              'duration': int_or_none(get_meta('length')), diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 81eb9db85..5bf64a56b 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -191,11 +191,12 @@ class RaiPlayIE(RaiBaseIE):          info = {              'id': video_id, -            'title': title, +            'title': self._live_title(title) if relinker_info.get( +                'is_live') else title,              'alt_title': media.get('subtitle'),              'description': media.get('description'), -            'uploader': media.get('channel'), -            'creator': media.get('editor'), +            'uploader': strip_or_none(media.get('channel')), +            'creator': strip_or_none(media.get('editor')),              'duration': parse_duration(video.get('duration')),              'timestamp': timestamp,              'thumbnails': thumbnails, @@ -208,10 +209,46 @@ class RaiPlayIE(RaiBaseIE):          }          info.update(relinker_info) -          return info +class RaiPlayLiveIE(RaiBaseIE): +    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' +    _TEST = { +        'url': 'http://www.raiplay.it/dirette/rainews24', +        'info_dict': { +            'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', +            'display_id': 'rainews24', +            'ext': 'mp4', +            'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            'description': 'md5:6eca31500550f9376819f174e5644754', +            'uploader': 'Rai News 24', +            'creator': 'Rai News 24', +            'is_live': True, +        }, +        'params': { +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        video_id = self._search_regex( +            r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, +            webpage, 'content id') + +        return { +            '_type': 'url_transparent', +            'ie_key': RaiPlayIE.ie_key(), +            'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, +            'id': video_id, +            'display_id': display_id, +        } + +  class RaiIE(RaiBaseIE):      _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE      _TESTS = [{ @@ -308,11 +345,11 @@ 
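RaiPlayLiveIE above adds no extraction logic of its own: it scrapes the ContentItem UUID out of the live page and defers to RaiPlayIE via a url_transparent result, so the delegate extractor fills in formats and metadata while the id and display_id set here take precedence. A reduced sketch of that delegation (the regex and URL template come from the hunk; UUID_RE approximates RaiBaseIE._UUID_RE and the surrounding plumbing is hypothetical):

import re

UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'

def resolve_live_page(webpage, display_id):
    # Locate the UUID of the content item backing the live stream.
    video_id = re.search(
        r'data-uniquename=["\']ContentItem-(%s)' % UUID_RE, webpage).group(1)
    # With _type 'url_transparent' the referenced extractor runs next and
    # merges its result under the fields already set here.
    return {
        '_type': 'url_transparent',
        'ie_key': 'RaiPlay',
        'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id,
        'id': video_id,
        'display_id': display_id,
    }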
class RaiIE(RaiBaseIE):          media_type = media['type']          if 'Audio' in media_type:              relinker_info = { -                'formats': { +                'formats': [{                      'format_id': media.get('formatoAudio'),                      'url': media['audioUrl'],                      'ext': media.get('formatoAudio'), -                } +                }]              }          elif 'Video' in media_type:              relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index afab62426..5d6cc3610 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -13,7 +13,7 @@ from ..utils import (  class RedBullTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film)/(?P<id>AP-\w+)' +    _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film|live)/(?:AP-\w+/segment/)?(?P<id>AP-\w+)'      _TESTS = [{          # film          'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc', @@ -42,6 +42,22 @@ class RedBullTVIE(InfoExtractor):              'season_number': 2,              'episode_number': 4,          }, +        'params': { +            'skip_download': True, +        }, +    }, { +        # segment +        'url': 'https://www.redbull.tv/live/AP-1R5DX49XS1W11/segment/AP-1QSAQJ6V52111/semi-finals', +        'info_dict': { +            'id': 'AP-1QSAQJ6V52111', +            'ext': 'mp4', +            'title': 'Semi Finals - Vans Park Series Pro Tour', +            'description': 'md5:306a2783cdafa9e65e39aa62f514fd97', +            'duration': 11791.991, +        }, +        'params': { +            'skip_download': True, +        },      }, {          'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion',          'only_matching': True, @@ -82,7 +98,8 @@ class RedBullTVIE(InfoExtractor):          title = info['title'].strip()          formats = self._extract_m3u8_formats( -            video['url'], video_id, 'mp4', 'm3u8_native') +            video['url'], video_id, 'mp4', entry_protocol='m3u8_native', +            m3u8_id='hls')          self._sort_formats(formats)          subtitles = {} diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py new file mode 100644 index 000000000..01c85ee01 --- /dev/null +++ b/youtube_dl/extractor/reddit.py @@ -0,0 +1,114 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    int_or_none, +    float_or_none, +) + + +class RedditIE(InfoExtractor): +    _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)' +    _TEST = { +        # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ +        'url': 'https://v.redd.it/zv89llsvexdz', +        'md5': '655d06ace653ea3b87bccfb1b27ec99d', +        'info_dict': { +            'id': 'zv89llsvexdz', +            'ext': 'mp4', +            'title': 'zv89llsvexdz', +        }, +        'params': { +            'format': 'bestvideo', +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        formats = self._extract_m3u8_formats( +            'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id, +            'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + +        formats.extend(self._extract_mpd_formats( +            'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, +            mpd_id='dash', 
fatal=False)) + +        return { +            'id': video_id, +            'title': video_id, +            'formats': formats, +        } + + +class RedditRIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/]+)' +    _TESTS = [{ +        'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', +        'info_dict': { +            'id': 'zv89llsvexdz', +            'ext': 'mp4', +            'title': 'That small heart attack.', +            'thumbnail': r're:^https?://.*\.jpg$', +            'timestamp': 1501941939, +            'upload_date': '20170805', +            'uploader': 'Antw87', +            'like_count': int, +            'dislike_count': int, +            'comment_count': int, +            'age_limit': 0, +        }, +        'params': { +            'format': 'bestvideo', +            'skip_download': True, +        }, +    }, { +        'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', +        'only_matching': True, +    }, { +        # imgur +        'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', +        'only_matching': True, +    }, { +        # streamable +        'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', +        'only_matching': True, +    }, { +        # youtube +        'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        data = self._download_json( +            url + '.json', video_id)[0]['data']['children'][0]['data'] + +        video_url = data['url'] + +        # Avoid recursing into the same reddit URL +        if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: +            raise ExtractorError('No media found', expected=True) + +        over_18 = data.get('over_18') +        if over_18 is True: +            age_limit = 18 +        elif over_18 is False: +            age_limit = 0 +        else: +            age_limit = None + +        return { +            '_type': 'url_transparent', +            'url': video_url, +            'title': data.get('title'), +            'thumbnail': data.get('thumbnail'), +            'timestamp': float_or_none(data.get('created_utc')), +            'uploader': data.get('author'), +            'like_count': int_or_none(data.get('ups')), +            'dislike_count': int_or_none(data.get('downs')), +            'comment_count': int_or_none(data.get('num_comments')), +            'age_limit': age_limit, +        } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index c367a6ae7..f70a75256 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import (      ExtractorError,      int_or_none, @@ -62,7 +63,23 @@ class RedTubeIE(InfoExtractor):                          'format_id': format_id,                          'height': int_or_none(format_id),                      }) -        else: +        medias = self._parse_json( +            self._search_regex( +                r'mediaDefinition\s*:\s*(\[.+?\])', webpage, +                'media definitions', default='{}'), +            video_id, fatal=False) +        if medias and isinstance(medias, list): +            for 
media in medias: +                format_url = media.get('videoUrl') +                if not format_url or not isinstance(format_url, compat_str): +                    continue +                format_id = media.get('quality') +                formats.append({ +                    'url': format_url, +                    'format_id': format_id, +                    'height': int_or_none(format_id), +                }) +        if not formats:              video_url = self._html_search_regex(                  r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')              formats.append({'url': video_url}) @@ -73,7 +90,7 @@ class RedTubeIE(InfoExtractor):              r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',              webpage, 'upload date', fatal=False))          duration = int_or_none(self._search_regex( -            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) +            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))          view_count = str_to_int(self._search_regex(              r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',              webpage, 'view count', fatal=False)) diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py index 2340dae53..e921ca3e6 100644 --- a/youtube_dl/extractor/rmcdecouverte.py +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -13,21 +13,20 @@ class RMCDecouverteIE(InfoExtractor):      _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)'      _TEST = { -        'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE', +        'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET',          'info_dict': { -            'id': '5111223049001', +            'id': '5419055995001',              'ext': 'mp4', -            'title': ': LES HEROS DU 88e ETAGE', -            'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé  la vie d\'innombrables personnes le 11 septembre 2001.', +            'title': 'UN DELICIEUX PROJET', +            'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5',              'uploader_id': '1969646226001', -            'upload_date': '20160904', -            'timestamp': 1472951103, +            'upload_date': '20170502', +            'timestamp': 1493745308,          },          'params': { -            # rtmp download              'skip_download': True,          }, -        'skip': 'Only works from France', +        'skip': 'only available for a week',      }      BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' @@ -35,5 +34,12 @@ class RMCDecouverteIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) -        brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] -        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) +        if brightcove_legacy_url: +            brightcove_id = compat_parse_qs(compat_urlparse.urlparse( +                brightcove_legacy_url).query)['@videoPlayer'][0] +        else: +            brightcove_id = self._search_regex( +                r'data-video-id=["\'](\d+)', webpage, 
'brightcove id') +        return self.url_result( +            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', +            brightcove_id) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 54076de28..3e22998c6 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -15,7 +15,7 @@ class RtlNlIE(InfoExtractor):          https?://(?:www\.)?          (?:              rtlxl\.nl/[^\#]*\#!/[^/]+/| -            rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid= +            rtl\.nl/(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=|video/)          )          (?P<id>[0-9a-f-]+)''' @@ -70,6 +70,9 @@ class RtlNlIE(InfoExtractor):      }, {          'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f',          'only_matching': True, +    }, { +        'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 889fa7628..89d89b65a 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -7,43 +7,84 @@ import itertools  from .common import InfoExtractor  from ..compat import (      compat_str, +    compat_parse_qs, +    compat_urllib_parse_urlparse,  )  from ..utils import (      determine_ext, -    unified_strdate, +    bool_or_none, +    int_or_none, +    try_get, +    unified_timestamp,  ) -class RutubeIE(InfoExtractor): +class RutubeBaseIE(InfoExtractor): +    def _extract_video(self, video, video_id=None, require_title=True): +        title = video['title'] if require_title else video.get('title') + +        age_limit = video.get('is_adult') +        if age_limit is not None: +            age_limit = 18 if age_limit is True else 0 + +        uploader_id = try_get(video, lambda x: x['author']['id']) +        category = try_get(video, lambda x: x['category']['name']) + +        return { +            'id': video.get('id') or video_id, +            'title': title, +            'description': video.get('description'), +            'thumbnail': video.get('thumbnail_url'), +            'duration': int_or_none(video.get('duration')), +            'uploader': try_get(video, lambda x: x['author']['name']), +            'uploader_id': compat_str(uploader_id) if uploader_id else None, +            'timestamp': unified_timestamp(video.get('created_ts')), +            'category': [category] if category else None, +            'age_limit': age_limit, +            'view_count': int_or_none(video.get('hits')), +            'comment_count': int_or_none(video.get('comments_count')), +            'is_live': bool_or_none(video.get('is_livestream')), +        } + + +class RutubeIE(RutubeBaseIE):      IE_NAME = 'rutube'      IE_DESC = 'Rutube videos'      _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'      _TESTS = [{          'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', +        'md5': '79938ade01294ef7e27574890d0d3769',          'info_dict': {              'id': '3eac3b4561676c17df9132a9a1e62e3e', -            'ext': 'mp4', +            'ext': 'flv',              'title': 'Раненный кенгуру забежал в аптеку',              'description': 'http://www.ntdtv.ru ',              'duration': 80,              'uploader': 'NTDRussian',              'uploader_id': '29790', +            'timestamp': 
1381943602,              'upload_date': '20131016',              'age_limit': 0,          }, -        'params': { -            # It requires ffmpeg (m3u8 download) -            'skip_download': True, -        },      }, {          'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',          'only_matching': True,      }, {          'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661',          'only_matching': True, +    }, { +        'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', +        'only_matching': True, +    }, { +        'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', +        'only_matching': True,      }] +    @classmethod +    def suitable(cls, url): +        return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) +      @staticmethod      def _extract_urls(webpage):          return [mobj.group('url') for mobj in re.finditer( @@ -52,12 +93,12 @@ class RutubeIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) +          video = self._download_json(              'http://rutube.ru/api/video/%s/?format=json' % video_id,              video_id, 'Downloading video JSON') -        # Some videos don't have the author field -        author = video.get('author') or {} +        info = self._extract_video(video, video_id)          options = self._download_json(              'http://rutube.ru/api/play/options/%s/?format=json' % video_id, @@ -79,19 +120,8 @@ class RutubeIE(InfoExtractor):                  })          self._sort_formats(formats) -        return { -            'id': video['id'], -            'title': video['title'], -            'description': video['description'], -            'duration': video['duration'], -            'view_count': video['hits'], -            'formats': formats, -            'thumbnail': video['thumbnail_url'], -            'uploader': author.get('name'), -            'uploader_id': compat_str(author['id']) if author else None, -            'upload_date': unified_strdate(video['created_ts']), -            'age_limit': 18 if video['is_adult'] else 0, -        } +        info['formats'] = formats +        return info  class RutubeEmbedIE(InfoExtractor): @@ -103,7 +133,8 @@ class RutubeEmbedIE(InfoExtractor):          'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',          'info_dict': {              'id': 'a10e53b86e8f349080f718582ce4c661', -            'ext': 'mp4', +            'ext': 'flv', +            'timestamp': 1387830582,              'upload_date': '20131223',              'uploader_id': '297833',              'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', @@ -111,7 +142,7 @@ class RutubeEmbedIE(InfoExtractor):              'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89',          },          'params': { -            'skip_download': 'Requires ffmpeg', +            'skip_download': True,          },      }, {          'url': 'http://rutube.ru/play/embed/8083783', @@ -125,10 +156,51 @@ class RutubeEmbedIE(InfoExtractor):          canonical_url = self._html_search_regex(              r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage,              'Canonical URL') -        return self.url_result(canonical_url, 'Rutube') +        return self.url_result(canonical_url, RutubeIE.ie_key()) + + +class 
RutubePlaylistBaseIE(RutubeBaseIE): +    def _next_page_url(self, page_num, playlist_id, *args, **kwargs): +        return self._PAGE_TEMPLATE % (playlist_id, page_num) +    def _entries(self, playlist_id, *args, **kwargs): +        next_page_url = None +        for pagenum in itertools.count(1): +            page = self._download_json( +                next_page_url or self._next_page_url( +                    pagenum, playlist_id, *args, **kwargs), +                playlist_id, 'Downloading page %s' % pagenum) + +            results = page.get('results') +            if not results or not isinstance(results, list): +                break + +            for result in results: +                video_url = result.get('video_url') +                if not video_url or not isinstance(video_url, compat_str): +                    continue +                entry = self._extract_video(result, require_title=False) +                entry.update({ +                    '_type': 'url', +                    'url': video_url, +                    'ie_key': RutubeIE.ie_key(), +                }) +                yield entry -class RutubeChannelIE(InfoExtractor): +            next_page_url = page.get('next') +            if not next_page_url or not page.get('has_next'): +                break + +    def _extract_playlist(self, playlist_id, *args, **kwargs): +        return self.playlist_result( +            self._entries(playlist_id, *args, **kwargs), +            playlist_id, kwargs.get('playlist_name')) + +    def _real_extract(self, url): +        return self._extract_playlist(self._match_id(url)) + + +class RutubeChannelIE(RutubePlaylistBaseIE):      IE_NAME = 'rutube:channel'      IE_DESC = 'Rutube channels'      _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' @@ -142,27 +214,8 @@ class RutubeChannelIE(InfoExtractor):      _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' -    def _extract_videos(self, channel_id, channel_title=None): -        entries = [] -        for pagenum in itertools.count(1): -            page = self._download_json( -                self._PAGE_TEMPLATE % (channel_id, pagenum), -                channel_id, 'Downloading page %s' % pagenum) -            results = page['results'] -            if not results: -                break -            entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results) -            if not page['has_next']: -                break -        return self.playlist_result(entries, channel_id, channel_title) - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        channel_id = mobj.group('id') -        return self._extract_videos(channel_id) - -class RutubeMovieIE(RutubeChannelIE): +class RutubeMovieIE(RutubePlaylistBaseIE):      IE_NAME = 'rutube:movie'      IE_DESC = 'Rutube movies'      _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' @@ -176,11 +229,11 @@ class RutubeMovieIE(RutubeChannelIE):          movie = self._download_json(              self._MOVIE_TEMPLATE % movie_id, movie_id,              'Downloading movie JSON') -        movie_name = movie['name'] -        return self._extract_videos(movie_id, movie_name) +        return self._extract_playlist( +            movie_id, playlist_name=movie.get('name')) -class RutubePersonIE(RutubeChannelIE): +class RutubePersonIE(RutubePlaylistBaseIE):      IE_NAME = 'rutube:person'      IE_DESC = 'Rutube person videos'      _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' @@ -193,3 +246,37 @@ 
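The RutubePlaylistBaseIE generator above replaces the old per-class _extract_videos loop: pages are fetched lazily, the API's own 'next' link is preferred over a formatted page URL, and iteration stops on an empty result set or has_next being false. A self-contained sketch of that paging loop (fetch_page is a hypothetical stand-in for the _download_json call):

import itertools

def paged_entries(fetch_page):
    # fetch_page(pagenum, next_page_url) -> dict with 'results' (list),
    # 'next' (URL of the following page) and 'has_next' (bool).
    next_page_url = None
    for pagenum in itertools.count(1):
        page = fetch_page(pagenum, next_page_url)
        results = page.get('results')
        if not results or not isinstance(results, list):
            break
        for result in results:
            video_url = result.get('video_url')
            if video_url:
                yield video_url
        next_page_url = page.get('next')
        if not next_page_url or not page.get('has_next'):
            break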
class RutubePersonIE(RutubeChannelIE):      }]      _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' + + +class RutubePlaylistIE(RutubePlaylistBaseIE): +    IE_NAME = 'rutube:playlist' +    IE_DESC = 'Rutube playlists' +    _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)' +    _TESTS = [{ +        'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', +        'info_dict': { +            'id': '3097', +        }, +        'playlist_count': 27, +    }, { +        'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', +        'only_matching': True, +    }] + +    _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' + +    @classmethod +    def suitable(cls, url): +        if not super(RutubePlaylistIE, cls).suitable(url): +            return False +        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) +        return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) + +    def _next_page_url(self, page_num, playlist_id, item_kind): +        return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) + +    def _real_extract(self, url): +        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) +        playlist_kind = qs['pl_type'][0] +        playlist_id = qs['pl_id'][0] +        return self._extract_playlist(playlist_id, item_kind=playlist_kind) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index a5e672c0a..d2713c19a 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -13,11 +13,15 @@ from ..utils import (  class RUTVIE(InfoExtractor):      IE_DESC = 'RUTV.RU'      _VALID_URL = r'''(?x) -        https?://player\.(?:rutv\.ru|vgtrk\.com)/ -            (?P<path>flash\d+v/container\.swf\?id= -            |iframe/(?P<type>swf|video|live)/id/ -            |index/iframe/cast_id/) -            (?P<id>\d+)''' +                    https?:// +                        (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/ +                        (?P<path> +                            flash\d+v/container\.swf\?id=| +                            iframe/(?P<type>swf|video|live)/id/| +                            index/iframe/cast_id/ +                        ) +                        (?P<id>\d+) +                    '''      _TESTS = [          { @@ -99,17 +103,21 @@ class RUTVIE(InfoExtractor):                  'skip_download': True,              },          }, +        { +            'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/', +            'only_matching': True, +        },      ]      @classmethod      def _extract_url(cls, webpage):          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)          if mobj:              return mobj.group('url')          mobj = re.search( -            r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', +            
r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',              webpage)          if mobj:              return mobj.group('url') diff --git a/youtube_dl/extractor/ruv.py b/youtube_dl/extractor/ruv.py new file mode 100644 index 000000000..8f3cc4095 --- /dev/null +++ b/youtube_dl/extractor/ruv.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    unified_timestamp, +) + + +class RuvIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:sarpurinn/[^/]+|node)/(?P<id>[^/]+(?:/\d+)?)' +    _TESTS = [{ +        # m3u8 +        'url': 'http://ruv.is/sarpurinn/ruv-aukaras/fh-valur/20170516', +        'md5': '66347652f4e13e71936817102acc1724', +        'info_dict': { +            'id': '1144499', +            'display_id': 'fh-valur/20170516', +            'ext': 'mp4', +            'title': 'FH - Valur', +            'description': 'Bein útsending frá 3. leik FH og Vals í úrslitum Olísdeildar karla í handbolta.', +            'timestamp': 1494963600, +            'upload_date': '20170516', +        }, +    }, { +        # mp3 +        'url': 'http://ruv.is/sarpurinn/ras-2/morgunutvarpid/20170619', +        'md5': '395ea250c8a13e5fdb39d4670ef85378', +        'info_dict': { +            'id': '1153630', +            'display_id': 'morgunutvarpid/20170619', +            'ext': 'mp3', +            'title': 'Morgunútvarpið', +            'description': 'md5:a4cf1202c0a1645ca096b06525915418', +            'timestamp': 1497855000, +            'upload_date': '20170619', +        }, +    }, { +        'url': 'http://ruv.is/sarpurinn/ruv/frettir/20170614', +        'only_matching': True, +    }, { +        'url': 'http://www.ruv.is/node/1151854', +        'only_matching': True, +    }, { +        'url': 'http://ruv.is/sarpurinn/klippa/secret-soltice-hefst-a-morgun', +        'only_matching': True, +    }, { +        'url': 'http://ruv.is/sarpurinn/ras-1/morgunvaktin/20170619', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        title = self._og_search_title(webpage) + +        FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1' + +        media_url = self._html_search_regex( +            FIELD_RE % 'src', webpage, 'video URL', group='url') + +        video_id = self._search_regex( +            r'<link\b[^>]+\bhref=["\']https?://www\.ruv\.is/node/(\d+)', +            webpage, 'video id', default=display_id) + +        ext = determine_ext(media_url) + +        if ext == 'm3u8': +            formats = self._extract_m3u8_formats( +                media_url, video_id, 'mp4', entry_protocol='m3u8_native', +                m3u8_id='hls') +        elif ext == 'mp3': +            formats = [{ +                'format_id': 'mp3', +                'url': media_url, +                'vcodec': 'none', +            }] +        else: +            formats = [{ +                'url': media_url, +            }] + +        description = self._og_search_description(webpage, default=None) +        thumbnail = self._og_search_thumbnail( +            webpage, default=None) or self._search_regex( +            FIELD_RE % 'poster', webpage, 'thumbnail', fatal=False) +        timestamp = unified_timestamp(self._html_search_meta( +            
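
A sketch of the extension dispatch the new RuvIE uses above: HLS manifests expand into several variant formats, bare MP3s are marked audio-only with 'vcodec': 'none' so the format selector treats them correctly, and anything else becomes one direct-URL format. extract_m3u8 is an assumed stand-in for _extract_m3u8_formats:

def formats_for(media_url, ext, extract_m3u8):
    if ext == 'm3u8':
        # HLS: one manifest, many variant formats
        return extract_m3u8(media_url)
    if ext == 'mp3':
        # audio-only: declare the absence of a video stream
        return [{'format_id': 'mp3', 'url': media_url, 'vcodec': 'none'}]
    return [{'url': media_url}]
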
'article:published_time', webpage, 'timestamp', fatal=False)) + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'timestamp': timestamp, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index c3aec1edd..909a6ba97 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -16,7 +16,6 @@ from ..utils import (  class SafariBaseIE(InfoExtractor):      _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' -    _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'      _NETRC_MACHINE = 'safari'      _API_BASE = 'https://www.safaribooksonline.com/api/v1' @@ -28,10 +27,6 @@ class SafariBaseIE(InfoExtractor):          self._login()      def _login(self): -        # We only need to log in once for courses or individual videos -        if self.LOGGED_IN: -            return -          (username, password) = self._get_login_info()          if username is None:              return @@ -39,11 +34,17 @@ class SafariBaseIE(InfoExtractor):          headers = std_headers.copy()          if 'Referer' not in headers:              headers['Referer'] = self._LOGIN_URL -        login_page_request = sanitized_Request(self._LOGIN_URL, headers=headers)          login_page = self._download_webpage( -            login_page_request, None, -            'Downloading login form') +            self._LOGIN_URL, None, 'Downloading login form', headers=headers) + +        def is_logged(webpage): +            return any(re.search(p, webpage) for p in ( +                r'href=["\']/accounts/logout/', r'>Sign Out<')) + +        if is_logged(login_page): +            self.LOGGED_IN = True +            return          csrf = self._html_search_regex(              r"name='csrfmiddlewaretoken'\s+value='([^']+)'", @@ -62,14 +63,12 @@ class SafariBaseIE(InfoExtractor):          login_page = self._download_webpage(              request, None, 'Logging in as %s' % username) -        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: +        if not is_logged(login_page):              raise ExtractorError(                  'Login failed; make sure your credentials are correct and try again.',                  expected=True) -        SafariBaseIE.LOGGED_IN = True - -        self.to_screen('Login successful') +        self.LOGGED_IN = True  class SafariIE(SafariBaseIE): diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py index 5e22ea730..3df51520b 100644 --- a/youtube_dl/extractor/sexu.py +++ b/youtube_dl/extractor/sexu.py @@ -32,8 +32,9 @@ class SexuIE(InfoExtractor):          formats = [{              'url': source['file'].replace('\\', ''),              'format_id': source.get('label'), -            'height': self._search_regex( -                r'^(\d+)[pP]', source.get('label', ''), 'height', default=None), +            'height': int(self._search_regex( +                r'^(\d+)[pP]', source.get('label', ''), 'height', +                default=None)),          } for source in sources if source.get('file')]          self._sort_formats(formats) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 74a1dc672..e89ebebe7 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -31,7 +31,7 @@ class SlideshareIE(InfoExtractor):          page_title = 
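
One caveat in the sexu.py hunk above: int(...) raises TypeError whenever a label carries no height, because _search_regex(..., default=None) then returns None. youtube-dl's own int_or_none helper is the defensive idiom for exactly this case; a small illustration:

from youtube_dl.utils import int_or_none

assert int_or_none('1080') == 1080   # '1080p' label -> height 1080
assert int_or_none(None) is None     # no match -> height simply stays absent
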
mobj.group('title')          webpage = self._download_webpage(url, page_title)          slideshare_obj = self._search_regex( -            r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);', +            r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);',              webpage, 'slideshare object')          info = json.loads(slideshare_obj)          if info['slideshow']['type'] != 'video': diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 7da12cef8..a62ed84f1 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -8,7 +8,11 @@ from ..compat import (      compat_str,      compat_urllib_parse_urlencode,  ) -from ..utils import ExtractorError +from ..utils import ( +    ExtractorError, +    int_or_none, +    try_get, +)  class SohuIE(InfoExtractor): @@ -169,10 +173,11 @@ class SohuIE(InfoExtractor):                  formats.append({                      'url': video_url,                      'format_id': format_id, -                    'filesize': data['clipsBytes'][i], -                    'width': data['width'], -                    'height': data['height'], -                    'fps': data['fps'], +                    'filesize': int_or_none( +                        try_get(data, lambda x: x['clipsBytes'][i])), +                    'width': int_or_none(data.get('width')), +                    'height': int_or_none(data.get('height')), +                    'fps': int_or_none(data.get('fps')),                  })              self._sort_formats(formats) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 0ee4a8ff8..1c6799d57 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,8 +1,8 @@  # coding: utf-8  from __future__ import unicode_literals -import re  import itertools +import re  from .common import (      InfoExtractor, @@ -17,6 +17,7 @@ from ..utils import (      ExtractorError,      int_or_none,      unified_strdate, +    update_url_query,  ) @@ -31,6 +32,7 @@ class SoundcloudIE(InfoExtractor):      _VALID_URL = r'''(?x)^(?:https?://)?                      (?:(?:(?:www\.|m\.)?soundcloud\.com/ +                            (?!stations/track)                              (?P<uploader>[\w\d-]+)/                              (?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))                              (?P<title>[\w\d-]+)/? @@ -119,9 +121,24 @@ class SoundcloudIE(InfoExtractor):                  'license': 'cc-by-sa',              },          }, +        # private link, downloadable format +        { +            'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', +            'md5': '64a60b16e617d41d0bef032b7f55441e', +            'info_dict': { +                'id': '340344461', +                'ext': 'wav', +                'title': 'Uplifting Only 238 [No Talking] (incl. 
Alex Feed Guestmix) (Aug 31, 2017) [wav]', +                'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', +                'uploader': 'Ori Uplift Music', +                'upload_date': '20170831', +                'duration': 7449, +                'license': 'all-rights-reserved', +            }, +        },      ] -    _CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z' +    _CLIENT_ID = 'JlZIsxg2hY5WnBgtn3jfS0UYCl0K8DOg'      _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'      @staticmethod @@ -136,7 +153,7 @@ class SoundcloudIE(InfoExtractor):      @classmethod      def _resolv_url(cls, url): -        return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID +        return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID      def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):          track_id = compat_str(info['id']) @@ -159,11 +176,13 @@ class SoundcloudIE(InfoExtractor):              'license': info.get('license'),          }          formats = [] +        query = {'client_id': self._CLIENT_ID} +        if secret_token is not None: +            query['secret_token'] = secret_token          if info.get('downloadable', False):              # We can build a direct link to the song -            format_url = ( -                'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format( -                    track_id, self._CLIENT_ID)) +            format_url = update_url_query( +                'https://api.soundcloud.com/tracks/%s/download' % track_id, query)              formats.append({                  'format_id': 'download',                  'ext': info.get('original_format', 'mp3'), @@ -174,11 +193,8 @@ class SoundcloudIE(InfoExtractor):          # We have to retrieve the url          format_dict = self._download_json( -            'http://api.soundcloud.com/i1/tracks/%s/streams' % track_id, -            track_id, 'Downloading track url', query={ -                'client_id': self._CLIENT_ID, -                'secret_token': secret_token, -            }) +            'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id, +            track_id, 'Downloading track url', query=query)          for key, stream_url in format_dict.items():              abr = int_or_none(self._search_regex( @@ -215,7 +231,7 @@ class SoundcloudIE(InfoExtractor):              # cannot be always used, sometimes it can give an HTTP 404 error              formats.append({                  'format_id': 'fallback', -                'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, +                'url': update_url_query(info['stream_url'], query),                  'ext': ext,              }) @@ -236,7 +252,7 @@ class SoundcloudIE(InfoExtractor):          track_id = mobj.group('track_id')          if track_id is not None: -            info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID +            info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID              full_title = track_id              token = mobj.group('secret_token')              if token: @@ -261,7 +277,7 @@ class SoundcloudIE(InfoExtractor):              self.report_resolve(full_title) -            url = 'http://soundcloud.com/%s' % resolve_title +            url = 'https://soundcloud.com/%s' % resolve_title              info_json_url = self._resolv_url(url)          info = 
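
The SoundCloud changes above thread a single query dict (client_id plus an optional secret_token for private links) through every API request via update_url_query, which merges the dict into whatever query string the URL already carries. Roughly (the client_id value here is a placeholder):

from youtube_dl.utils import update_url_query

url = update_url_query(
    'https://api.soundcloud.com/tracks/340344461/download',
    {'client_id': 'XXX', 'secret_token': 's-AyZUd'})
# -> ...download?client_id=XXX&secret_token=s-AyZUd (parameter order may vary)
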
self._download_json(info_json_url, full_title, 'Downloading info JSON') @@ -290,7 +306,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):              'id': '2284613',              'title': 'The Royal Concept EP',          }, -        'playlist_mincount': 6, +        'playlist_mincount': 5,      }, {          'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',          'only_matching': True, @@ -304,7 +320,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):          # extract simple title (uploader + slug of song title)          slug_title = mobj.group('slug_title')          full_title = '%s/sets/%s' % (uploader, slug_title) -        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) +        url = 'https://soundcloud.com/%s/sets/%s' % (uploader, slug_title)          token = mobj.group('token')          if token: @@ -330,7 +346,63 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):          } -class SoundcloudUserIE(SoundcloudPlaylistBaseIE): +class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): +    _API_BASE = 'https://api.soundcloud.com' +    _API_V2_BASE = 'https://api-v2.soundcloud.com' + +    def _extract_playlist(self, base_url, playlist_id, playlist_title): +        COMMON_QUERY = { +            'limit': 50, +            'client_id': self._CLIENT_ID, +            'linked_partitioning': '1', +        } + +        query = COMMON_QUERY.copy() +        query['offset'] = 0 + +        next_href = base_url + '?' + compat_urllib_parse_urlencode(query) + +        entries = [] +        for i in itertools.count(): +            response = self._download_json( +                next_href, playlist_id, 'Downloading track page %s' % (i + 1)) + +            collection = response['collection'] +            if not collection: +                break + +            def resolve_permalink_url(candidates): +                for cand in candidates: +                    if isinstance(cand, dict): +                        permalink_url = cand.get('permalink_url') +                        entry_id = self._extract_id(cand) +                        if permalink_url and permalink_url.startswith('http'): +                            return permalink_url, entry_id + +            for e in collection: +                permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) +                if permalink_url: +                    entries.append(self.url_result(permalink_url, video_id=entry_id)) + +            next_href = response.get('next_href') +            if not next_href: +                break + +            parsed_next_href = compat_urlparse.urlparse(response['next_href']) +            qs = compat_urlparse.parse_qs(parsed_next_href.query) +            qs.update(COMMON_QUERY) +            next_href = compat_urlparse.urlunparse( +                parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) + +        return { +            '_type': 'playlist', +            'id': playlist_id, +            'title': playlist_title, +            'entries': entries, +        } + + +class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):      _VALID_URL = r'''(?x)                          https?://                              (?:(?:www|m)\.)?soundcloud\.com/ @@ -380,21 +452,18 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE):          'url': 'https://soundcloud.com/grynpyret/spotlight',          'info_dict': {              'id': '7098329', -            'title': 'GRYNPYRET (Spotlight)', +            'title': 
'Grynpyret (Spotlight)',          },          'playlist_mincount': 1,      }] -    _API_BASE = 'https://api.soundcloud.com' -    _API_V2_BASE = 'https://api-v2.soundcloud.com' -      _BASE_URL_MAP = { -        'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE, -        'tracks': '%s/users/%%s/tracks' % _API_BASE, -        'sets': '%s/users/%%s/playlists' % _API_V2_BASE, -        'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE, -        'likes': '%s/users/%%s/likes' % _API_V2_BASE, -        'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE, +        'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, +        'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE, +        'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, +        'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, +        'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, +        'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,      }      _TITLE_MAP = { @@ -410,70 +479,49 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE):          mobj = re.match(self._VALID_URL, url)          uploader = mobj.group('user') -        url = 'http://soundcloud.com/%s/' % uploader +        url = 'https://soundcloud.com/%s/' % uploader          resolv_url = self._resolv_url(url)          user = self._download_json(              resolv_url, uploader, 'Downloading user info')          resource = mobj.group('rsrc') or 'all' -        base_url = self._BASE_URL_MAP[resource] % user['id'] -        COMMON_QUERY = { -            'limit': 50, -            'client_id': self._CLIENT_ID, -            'linked_partitioning': '1', -        } +        return self._extract_playlist( +            self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']), +            '%s (%s)' % (user['username'], self._TITLE_MAP[resource])) -        query = COMMON_QUERY.copy() -        query['offset'] = 0 - -        next_href = base_url + '?' 
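
Note why _BASE_URL_MAP above must now spell out SoundcloudPagedPlaylistBaseIE._API_V2_BASE: a class body is a plain namespace, so attributes defined on a base class are not in scope while the subclass body executes. A minimal demonstration:

class Base(object):
    PREFIX = 'https://api-v2.soundcloud.com'

class Sub(Base):
    # bare PREFIX would raise NameError here; it must be qualified
    URL_MAP = {'sets': '%s/users/%%s/playlists' % Base.PREFIX}

assert Sub.URL_MAP['sets'] == 'https://api-v2.soundcloud.com/users/%s/playlists'
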
+ compat_urllib_parse_urlencode(query) -        entries = [] -        for i in itertools.count(): -            response = self._download_json( -                next_href, uploader, 'Downloading track page %s' % (i + 1)) - -            collection = response['collection'] -            if not collection: -                break - -            def resolve_permalink_url(candidates): -                for cand in candidates: -                    if isinstance(cand, dict): -                        permalink_url = cand.get('permalink_url') -                        entry_id = self._extract_id(cand) -                        if permalink_url and permalink_url.startswith('http'): -                            return permalink_url, entry_id +class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): +    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)' +    IE_NAME = 'soundcloud:trackstation' +    _TESTS = [{ +        'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', +        'info_dict': { +            'id': '286017854', +            'title': 'Track station: your-text', +        }, +        'playlist_mincount': 47, +    }] -            for e in collection: -                permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) -                if permalink_url: -                    entries.append(self.url_result(permalink_url, video_id=entry_id)) +    def _real_extract(self, url): +        track_name = self._match_id(url) -            next_href = response.get('next_href') -            if not next_href: -                break +        webpage = self._download_webpage(url, track_name) -            parsed_next_href = compat_urlparse.urlparse(response['next_href']) -            qs = compat_urlparse.parse_qs(parsed_next_href.query) -            qs.update(COMMON_QUERY) -            next_href = compat_urlparse.urlunparse( -                parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) +        track_id = self._search_regex( +            r'soundcloud:track-stations:(\d+)', webpage, 'track id') -        return { -            '_type': 'playlist', -            'id': compat_str(user['id']), -            'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]), -            'entries': entries, -        } +        return self._extract_playlist( +            '%s/stations/soundcloud:track-stations:%s/tracks' +            % (self._API_V2_BASE, track_id), +            track_id, 'Track station: %s' % track_name)  class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):      _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'      IE_NAME = 'soundcloud:playlist'      _TESTS = [{ -        'url': 'http://api.soundcloud.com/playlists/4110309', +        'url': 'https://api.soundcloud.com/playlists/4110309',          'info_dict': {              'id': '4110309',              'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index ec1b60388..84298fee4 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from .nexx import NexxEmbedIE  from .spiegeltv import SpiegeltvIE  from ..compat import compat_urlparse  from ..utils import ( @@ -121,6 +122,26 @@ class SpiegelArticleIE(InfoExtractor):          },     
     'playlist_count': 6, +    }, { +        # Nexx iFrame embed +        'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', +        'info_dict': { +            'id': '161464', +            'ext': 'mp4', +            'title': 'Nervenkitzel Achterbahn', +            'alt_title': 'Karussellbauer in Deutschland', +            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', +            'release_year': 2005, +            'creator': 'SPIEGEL TV', +            'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 2761, +            'timestamp': 1394021479, +            'upload_date': '20140305', +        }, +        'params': { +            'format': 'bestvideo', +            'skip_download': True, +        },      }]      def _real_extract(self, url): @@ -143,6 +164,9 @@ class SpiegelArticleIE(InfoExtractor):          entries = [              self.url_result(compat_urlparse.urljoin(                  self.http_scheme() + '//spiegel.de/', embed_path)) -            for embed_path in embeds -        ] -        return self.playlist_result(entries) +            for embed_path in embeds] +        if embeds: +            return self.playlist_result(entries) + +        return self.playlist_from_matches( +            NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index e1cfb8698..6ccf4c342 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -1,114 +1,17 @@ -# coding: utf-8  from __future__ import unicode_literals  from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse -from ..utils import ( -    determine_ext, -    float_or_none, -) +from .nexx import NexxIE  class SpiegeltvIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/(?:#/)?filme/(?P<id>[\-a-z0-9]+)' -    _TESTS = [{ -        'url': 'http://www.spiegel.tv/filme/flug-mh370/', -        'info_dict': { -            'id': 'flug-mh370', -            'ext': 'm4v', -            'title': 'Flug MH370', -            'description': 'Das Rätsel um die Boeing 777 der Malaysia-Airlines', -            'thumbnail': r're:http://.*\.jpg$', -        }, -        'params': { -            # m3u8 download -            'skip_download': True, -        } -    }, { -        'url': 'http://www.spiegel.tv/#/filme/alleskino-die-wahrheit-ueber-maenner/', +    _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' +    _TEST = { +        'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/',          'only_matching': True, -    }] +    }      def _real_extract(self, url): -        if '/#/' in url: -            url = url.replace('/#/', '/') -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) -        title = self._html_search_regex(r'<h1.*?>(.*?)</h1>', webpage, 'title') - -        apihost = 'http://spiegeltv-ivms2-restapi.s3.amazonaws.com' -        version_json = self._download_json( -            '%s/version.json' % apihost, video_id, -            note='Downloading version information') -        version_name = version_json['version_name'] - -        slug_json = self._download_json( -            '%s/%s/restapi/slugs/%s.json' % (apihost, version_name, video_id), -            video_id, -            note='Downloading object information') -        oid = slug_json['object_id'] - -        media_json = self._download_json( -            
'%s/%s/restapi/media/%s.json' % (apihost, version_name, oid), -            video_id, note='Downloading media information') -        uuid = media_json['uuid'] -        is_wide = media_json['is_wide'] - -        server_json = self._download_json( -            'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json', -            video_id, note='Downloading server information') - -        format = '16x9' if is_wide else '4x3' - -        formats = [] -        for streamingserver in server_json['streamingserver']: -            endpoint = streamingserver.get('endpoint') -            if not endpoint: -                continue -            play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format) -            if endpoint.startswith('rtmp'): -                formats.append({ -                    'url': endpoint, -                    'format_id': 'rtmp', -                    'app': compat_urllib_parse_urlparse(endpoint).path[1:], -                    'play_path': play_path, -                    'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf', -                    'ext': 'flv', -                    'rtmp_live': True, -                }) -            elif determine_ext(endpoint) == 'm3u8': -                formats.append({ -                    'url': endpoint.replace('[video]', play_path), -                    'ext': 'm4v', -                    'format_id': 'hls',  # Prefer hls since it allows to workaround georestriction -                    'protocol': 'm3u8', -                    'preference': 1, -                    'http_headers': { -                        'Accept-Encoding': 'deflate',  # gzip causes trouble on the server side -                    }, -                }) -            else: -                formats.append({ -                    'url': endpoint, -                }) -        self._check_formats(formats, video_id) - -        thumbnails = [] -        for image in media_json['images']: -            thumbnails.append({ -                'url': image['url'], -                'width': image['width'], -                'height': image['height'], -            }) - -        description = media_json['subtitle'] -        duration = float_or_none(media_json.get('duration_in_ms'), scale=1000) - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'duration': duration, -            'thumbnails': thumbnails, -            'formats': formats, -        } +        return self.url_result( +            'https://api.nexx.cloud/v3/748/videos/byid/%s' +            % self._match_id(url), ie=NexxIE.ie_key()) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index e7bd5bf91..54497c880 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( +    determine_ext, +    int_or_none, +    js_to_json, +)  class SportBoxEmbedIE(InfoExtractor): @@ -14,8 +18,10 @@ class SportBoxEmbedIE(InfoExtractor):          'info_dict': {              'id': '211355',              'ext': 'mp4', -            'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', +            'title': '211355',              'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 292, +            'view_count': int,          },          'params': {              # m3u8 download @@ -24,6 +30,9 @@ class 
SportBoxEmbedIE(InfoExtractor):      }, {          'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',          'only_matching': True, +    }, { +        'url': 'https://news.sportbox.ru/vdl/player/media/193095', +        'only_matching': True,      }]      @staticmethod @@ -37,36 +46,34 @@ class SportBoxEmbedIE(InfoExtractor):          webpage = self._download_webpage(url, video_id) -        formats = [] - -        def cleanup_js(code): -            # desktop_advert_config contains complex Javascripts and we don't need it -            return js_to_json(re.sub(r'desktop_advert_config.*', '', code)) - -        jwplayer_data = self._parse_json(self._search_regex( -            r'(?s)player\.setup\(({.+?})\);', webpage, 'jwplayer settings'), video_id, -            transform_source=cleanup_js) - -        hls_url = jwplayer_data.get('hls_url') -        if hls_url: -            formats.extend(self._extract_m3u8_formats( -                hls_url, video_id, ext='mp4', m3u8_id='hls')) - -        rtsp_url = jwplayer_data.get('rtsp_url') -        if rtsp_url: -            formats.append({ -                'url': rtsp_url, -                'format_id': 'rtsp', -            }) +        wjplayer_data = self._parse_json( +            self._search_regex( +                r'(?s)wjplayer\(({.+?})\);', webpage, 'wjplayer settings'), +            video_id, transform_source=js_to_json) +        formats = [] +        for source in wjplayer_data['sources']: +            src = source.get('src') +            if not src: +                continue +            if determine_ext(src) == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    src, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id='hls', fatal=False)) +            else: +                formats.append({ +                    'url': src, +                })          self._sort_formats(formats) -        title = jwplayer_data['node_title'] -        thumbnail = jwplayer_data.get('image_url') +        view_count = int_or_none(self._search_regex( +            r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None))          return {              'id': video_id, -            'title': title, -            'thumbnail': thumbnail, +            'title': video_id, +            'thumbnail': wjplayer_data.get('poster'), +            'duration': int_or_none(wjplayer_data.get('duration')), +            'view_count': view_count,              'formats': formats,          } diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py index aa4fad162..a9e34c027 100644 --- a/youtube_dl/extractor/streamango.py +++ b/youtube_dl/extractor/streamango.py @@ -22,6 +22,17 @@ class StreamangoIE(InfoExtractor):              'title': '20170315_150006.mp4',          }      }, { +        # no og:title +        'url': 'https://streamango.com/embed/foqebrpftarclpob/asdf_asd_2_mp4', +        'info_dict': { +            'id': 'foqebrpftarclpob', +            'ext': 'mp4', +            'title': 'foqebrpftarclpob', +        }, +        'params': { +            'skip_download': True, +        }, +    }, {          'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',          'only_matching': True,      }] @@ -31,7 +42,7 @@ class StreamangoIE(InfoExtractor):          webpage = self._download_webpage(url, video_id) -        title = self._og_search_title(webpage) +        title = self._og_search_title(webpage, 
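
The sportbox.py rewrite above scrapes the wjplayer({...}) settings object and feeds it through js_to_json before parsing, since the page embeds a JavaScript literal (unquoted keys, single-quoted strings) rather than strict JSON. A sketch of that step, with an invented settings blob:

import json

from youtube_dl.utils import js_to_json

js_blob = "{sources: [{src: '/video.m3u8'}], duration: 292}"
data = json.loads(js_to_json(js_blob))
assert data['sources'][0]['src'] == '/video.m3u8'
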
default=video_id)          formats = []          for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 9e533103c..58e0b4c80 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -26,7 +26,7 @@ class StreamCZIE(InfoExtractor):      _TESTS = [{          'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', -        'md5': '6d3ca61a8d0633c9c542b92fcb936b0c', +        'md5': '934bb6a6d220d99c010783c9719960d5',          'info_dict': {              'id': '765767',              'ext': 'mp4', @@ -37,7 +37,7 @@ class StreamCZIE(InfoExtractor):          },      }, {          'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', -        'md5': 'e54a254fb8b871968fd8403255f28589', +        'md5': '849a88c1e1ca47d41403c2ba5e59e261',          'info_dict': {              'id': '10002447',              'ext': 'mp4', @@ -85,6 +85,14 @@ class StreamCZIE(InfoExtractor):          else:              title = data['name'] +        subtitles = {} +        srt_url = data.get('subtitles_srt') +        if srt_url: +            subtitles['cs'] = [{ +                'ext': 'srt', +                'url': srt_url, +            }] +          return {              'id': video_id,              'title': title, @@ -93,4 +101,5 @@ class StreamCZIE(InfoExtractor):              'description': data.get('web_site_text'),              'duration': int_or_none(data.get('duration')),              'view_count': int_or_none(data.get('views')), +            'subtitles': subtitles,          } diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 1b5afb73e..48bc4529e 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -181,7 +181,8 @@ class SVTPlayIE(SVTBaseIE):          if video_id:              data = self._download_json( -                'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id) +                'https://api.svt.se/videoplayer-api/video/%s' % video_id, +                video_id, headers=self.geo_verification_headers())              info_dict = self._extract_video(data, video_id)              if not info_dict.get('title'):                  info_dict['title'] = re.sub( diff --git a/youtube_dl/extractor/tastytrade.py b/youtube_dl/extractor/tastytrade.py new file mode 100644 index 000000000..7fe96bd5f --- /dev/null +++ b/youtube_dl/extractor/tastytrade.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class TastyTradeIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)' + +    _TESTS = [{ +        'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017', +        'info_dict': { +            'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', +            'ext': 'mp4', +            'title': 'A History of Teaming', +            'description': 'md5:2a9033db8da81f2edffa4c99888140b3', +            'duration': 422.255, +        }, +        'params': { +            'skip_download': True, +        }, +        'add_ie': ['Ooyala'], +    }, { +        'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) + +        
ooyala_code = self._search_regex( +            r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1', +            webpage, 'ooyala code', group='code') + +        info = self._search_json_ld(webpage, display_id, fatal=False) +        info.update({ +            '_type': 'url_transparent', +            'ie_key': OoyalaIE.ie_key(), +            'url': 'ooyala:%s' % ooyala_code, +            'display_id': display_id, +        }) +        return info diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index bf93eb868..e9474533f 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -8,6 +8,9 @@ from ..utils import extract_attributes  class TBSIE(TurnerBaseIE): +    # https://github.com/rg3/youtube-dl/issues/13658 +    _WORKING = False +      _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html'      _TESTS = [{          'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html', @@ -17,7 +20,8 @@ class TBSIE(TurnerBaseIE):              'ext': 'mp4',              'title': 'Theatrical Trailer',              'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.', -        } +        }, +        'skip': 'TBS videos are deleted after a while',      }, {          'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html',          'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56', @@ -26,7 +30,8 @@ class TBSIE(TurnerBaseIE):              'ext': 'mp4',              'title': 'You Better Run',              'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.', -        } +        }, +        'skip': 'TBS videos are deleted after a while',      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/teamfourstar.py b/youtube_dl/extractor/teamfourstar.py deleted file mode 100644 index a8c6ed7be..000000000 --- a/youtube_dl/extractor/teamfourstar.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from .jwplatform import JWPlatformIE -from ..utils import unified_strdate - - -class TeamFourStarIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/(?P<id>[a-z0-9\-]+)' -    _TEST = { -        'url': 'http://teamfourstar.com/tfs-abridged-parody-episode-1-2/', -        'info_dict': { -            'id': '0WdZO31W', -            'title': 'TFS Abridged Parody Episode 1', -            'description': 'md5:d60bc389588ebab2ee7ad432bda953ae', -            'ext': 'mp4', -            'timestamp': 1394168400, -            'upload_date': '20080508', -        }, -    } - -    def _real_extract(self, url): -        display_id = self._match_id(url) -        webpage = self._download_webpage(url, display_id) - -        jwplatform_url = JWPlatformIE._extract_url(webpage) - -        video_title = self._html_search_regex( -            r'<h1[^>]+class="entry-title"[^>]*>(?P<title>.+?)</h1>', -            webpage, 'title') -        video_date = unified_strdate(self._html_search_regex( -            r'<span[^>]+class="meta-date date updated"[^>]*>(?P<date>.+?)</span>', -            webpage, 'date', fatal=False)) -        video_description = self._html_search_regex( -            r'(?s)<div[^>]+class="content-inner"[^>]*>.*?(?P<description><p>.+?)</div>', -            webpage, 'description', fatal=False) 
-        video_thumbnail = self._og_search_thumbnail(webpage) - -        return { -            '_type': 'url_transparent', -            'display_id': display_id, -            'title': video_title, -            'description': video_description, -            'upload_date': video_date, -            'thumbnail': video_thumbnail, -            'url': jwplatform_url, -        } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 3f3c681ae..06a27fd04 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -6,7 +6,10 @@ import re  from .common import InfoExtractor  from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( +    int_or_none, +    try_get, +)  class TEDIE(InfoExtractor): @@ -113,8 +116,9 @@ class TEDIE(InfoExtractor):      }      def _extract_info(self, webpage): -        info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>', -                                       webpage, 'info json') +        info_json = self._search_regex( +            r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>', +            webpage, 'info json')          return json.loads(info_json)      def _real_extract(self, url): @@ -136,11 +140,16 @@ class TEDIE(InfoExtractor):          webpage = self._download_webpage(url, name,                                           'Downloading playlist webpage')          info = self._extract_info(webpage) -        playlist_info = info['playlist'] + +        playlist_info = try_get( +            info, lambda x: x['__INITIAL_DATA__']['playlist'], +            dict) or info['playlist']          playlist_entries = [              self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key()) -            for talk in info['talks'] +            for talk in try_get( +                info, lambda x: x['__INITIAL_DATA__']['talks'], +                dict) or info['talks']          ]          return self.playlist_result(              playlist_entries, @@ -149,9 +158,14 @@ class TEDIE(InfoExtractor):      def _talk_info(self, url, video_name):          webpage = self._download_webpage(url, video_name) -        self.report_extraction(video_name) -        talk_info = self._extract_info(webpage)['talks'][0] +        info = self._extract_info(webpage) + +        talk_info = try_get( +            info, lambda x: x['__INITIAL_DATA__']['talks'][0], +            dict) or info['talks'][0] + +        title = talk_info['title'].strip()          external = talk_info.get('external')          if external: @@ -165,19 +179,27 @@ class TEDIE(InfoExtractor):                  'url': ext_url or external['uri'],              } +        native_downloads = try_get( +            talk_info, lambda x: x['downloads']['nativeDownloads'], +            dict) or talk_info['nativeDownloads'] +          formats = [{              'url': format_url,              'format_id': format_id,              'format': format_id, -        } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None] +        } for (format_id, format_url) in native_downloads.items() if format_url is not None]          if formats:              for f in formats:                  finfo = self._NATIVE_FORMATS.get(f['format_id'])                  if finfo:                      f.update(finfo) +        player_talk = talk_info['player_talks'][0] + +        resources_ = player_talk.get('resources') or talk_info.get('resources') +          http_url = None -        for format_id, resources in talk_info['resources'].items(): +    
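
The ted.py changes above lean on try_get with a tuple of getters, so the extractor can read either the new __INITIAL_DATA__ page layout or the legacy one without nested KeyError handling. Roughly:

from youtube_dl.utils import try_get

info = {'__INITIAL_DATA__': {'talks': [{'title': 'Example talk'}]}}

talk = try_get(
    info,
    (lambda x: x['__INITIAL_DATA__']['talks'][0],  # new page layout
     lambda x: x['talks'][0]),                     # legacy fallback
    dict)
assert talk['title'] == 'Example talk'
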
    for format_id, resources in resources_.items():              if format_id == 'h264':                  for resource in resources:                      h264_url = resource.get('file') @@ -237,14 +259,11 @@ class TEDIE(InfoExtractor):          video_id = compat_str(talk_info['id']) -        thumbnail = talk_info['thumb'] -        if not thumbnail.startswith('http'): -            thumbnail = 'http://' + thumbnail          return {              'id': video_id, -            'title': talk_info['title'].strip(), -            'uploader': talk_info['speaker'], -            'thumbnail': thumbnail, +            'title': title, +            'uploader': player_talk.get('speaker') or talk_info.get('speaker'), +            'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),              'description': self._og_search_description(webpage),              'subtitles': self._get_subtitles(video_id, talk_info),              'formats': formats, @@ -252,20 +271,22 @@ class TEDIE(InfoExtractor):          }      def _get_subtitles(self, video_id, talk_info): -        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] -        if languages: -            sub_lang_list = {} -            for l in languages: -                sub_lang_list[l] = [ -                    { -                        'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), -                        'ext': ext, -                    } -                    for ext in ['ted', 'srt'] -                ] -            return sub_lang_list -        else: -            return {} +        sub_lang_list = {} +        for language in try_get( +                talk_info, +                (lambda x: x['downloads']['languages'], +                 lambda x: x['languages']), list): +            lang_code = language.get('languageCode') or language.get('ianaCode') +            if not lang_code: +                continue +            sub_lang_list[lang_code] = [ +                { +                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext), +                    'ext': ext, +                } +                for ext in ['ted', 'srt'] +            ] +        return sub_lang_list      def _watch_info(self, url, name):          webpage = self._download_webpage(url, name) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 9a424b1c6..de236bbba 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -80,14 +80,33 @@ class ThePlatformBaseIE(OnceIE):                      'url': src,                  }) +        duration = info.get('duration') +        tp_chapters = info.get('chapters', []) +        chapters = [] +        if tp_chapters: +            def _add_chapter(start_time, end_time): +                start_time = float_or_none(start_time, 1000) +                end_time = float_or_none(end_time, 1000) +                if start_time is None or end_time is None: +                    return +                chapters.append({ +                    'start_time': start_time, +                    'end_time': end_time, +                }) + +            for chapter in tp_chapters[:-1]: +                _add_chapter(chapter.get('startTime'), chapter.get('endTime')) +            _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) +          return {              'title': info['title'],              'subtitles': subtitles,           
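
On the ThePlatform chapter handling above: startTime/endTime arrive in milliseconds, float_or_none(x, 1000) scales them to seconds while passing missing values through untouched, and the final chapter's absent endTime falls back to the clip duration. For instance:

from youtube_dl.utils import float_or_none

assert float_or_none(90500, 1000) == 90.5   # ms -> s
assert float_or_none(None, 1000) is None    # missing field stays missing
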
   'description': info['description'],              'thumbnail': info['defaultThumbnailUrl'], -            'duration': int_or_none(info.get('duration'), 1000), +            'duration': float_or_none(duration, 1000),              'timestamp': int_or_none(info.get('pubDate'), 1000) or None,              'uploader': info.get('billingCode'), +            'chapters': chapters,          }      def _extract_theplatform_metadata(self, path, video_id): diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py index b8504f0eb..cd642355c 100644 --- a/youtube_dl/extractor/thescene.py +++ b/youtube_dl/extractor/thescene.py @@ -3,10 +3,6 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..compat import compat_urlparse -from ..utils import ( -    int_or_none, -    qualities, -)  class TheSceneIE(InfoExtractor): @@ -24,6 +20,9 @@ class TheSceneIE(InfoExtractor):              'season': 'Ready To Wear Spring 2013',              'tags': list,              'categories': list, +            'upload_date': '20120913', +            'timestamp': 1347512400, +            'uploader': 'vogue',          },      } @@ -37,32 +36,9 @@ class TheSceneIE(InfoExtractor):              self._html_search_regex(                  r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) -        player = self._download_webpage(player_url, display_id) -        info = self._parse_json( -            self._search_regex( -                r'(?m)video\s*:\s*({.+?}),$', player, 'info json'), -            display_id) - -        video_id = info['id'] -        title = info['title'] - -        qualities_order = qualities(('low', 'high')) -        formats = [{ -            'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']), -            'url': f['src'], -            'quality': qualities_order(f['quality']), -        } for f in info['sources']] -        self._sort_formats(formats) -          return { -            'id': video_id, +            '_type': 'url_transparent',              'display_id': display_id, -            'title': title, -            'formats': formats, -            'thumbnail': info.get('poster_frame'), -            'duration': int_or_none(info.get('duration')), -            'series': info.get('series_title'), -            'season': info.get('season_title'), -            'tags': info.get('tags'), -            'categories': info.get('categories'), +            'url': player_url, +            'ie_key': 'CondeNast',          } diff --git a/youtube_dl/extractor/thisoldhouse.py b/youtube_dl/extractor/thisoldhouse.py index 197258df1..6ab147ad7 100644 --- a/youtube_dl/extractor/thisoldhouse.py +++ b/youtube_dl/extractor/thisoldhouse.py @@ -2,13 +2,15 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..compat import compat_str +from ..utils import try_get  class ThisOldHouseIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)'      _TESTS = [{          'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', -        'md5': '946f05bbaa12a33f9ae35580d2dfcfe3', +        'md5': '568acf9ca25a639f0c4ff905826b662f',          'info_dict': {              'id': '2REGtUDQ',              'ext': 'mp4', @@ -28,8 +30,15 @@ class ThisOldHouseIE(InfoExtractor):      def _real_extract(self, url):          display_id = self._match_id(url)          webpage = self._download_webpage(url, display_id) -        drupal_settings = 
self._parse_json(self._search_regex( -            r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', -            webpage, 'drupal settings'), display_id) -        video_id = drupal_settings['jwplatform']['video_id'] +        video_id = self._search_regex( +            (r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1', +             r'id=(["\'])inline-video-player-(?P<id>(?:(?!\1).)+)\1'), +            webpage, 'video id', default=None, group='id') +        if not video_id: +            drupal_settings = self._parse_json(self._search_regex( +                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', +                webpage, 'drupal settings'), display_id) +            video_id = try_get( +                drupal_settings, lambda x: x['jwplatform']['video_id'], +                compat_str) or list(drupal_settings['comScore'])[0]          return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id) diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index c54b876d3..348d6ecdf 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -17,7 +17,7 @@ from ..utils import (  class ToggleIE(InfoExtractor):      IE_NAME = 'toggle' -    _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)' +    _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)'      _TESTS = [{          'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115',          'info_dict': { @@ -73,6 +73,12 @@ class ToggleIE(InfoExtractor):      }, {          'url': 'http://video.toggle.sg/en/movies/seven-days/321936',          'only_matching': True, +    }, { +        'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', +        'only_matching': True, +    }, { +        'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585', +        'only_matching': True,      }]      _FORMAT_PREFERENCES = { diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 26d770992..e59ed2661 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -5,7 +5,6 @@ from .common import InfoExtractor  from ..utils import (      int_or_none,      js_to_json, -    ExtractorError,      urlencode_postdata,      extract_attributes,      smuggle_url, @@ -78,8 +77,10 @@ class TouTvIE(InfoExtractor):      def _real_extract(self, url):          path = self._match_id(url)          metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path) +        # IsDrm does not necessarily mean the video is DRM protected (see +        # https://github.com/rg3/youtube-dl/issues/13994).          
if metadata.get('IsDrm'): -            raise ExtractorError('This video is DRM protected.', expected=True) +            self.report_warning('This video is probably DRM protected.', path)          video_id = metadata['IdMedia']          details = metadata['Details']          title = details['OriginalTitle'] diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 938e05076..f705a06c9 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -6,42 +6,48 @@ import re  class ToypicsIE(InfoExtractor): -    IE_DESC = 'Toypics user profile' -    _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*' +    IE_DESC = 'Toypics video' +    _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)'      _TEST = {          'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',          'md5': '16e806ad6d6f58079d210fe30985e08b',          'info_dict': {              'id': '514',              'ext': 'mp4', -            'title': 'Chance-Bulge\'d, 2', +            'title': "Chance-Bulge'd, 2",              'age_limit': 18,              'uploader': 'kidsune',          }      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        page = self._download_webpage(url, video_id) -        video_url = self._html_search_regex( -            r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL') -        title = self._html_search_regex( -            r'<title>Toypics - ([^<]+)</title>', page, 'title') -        username = self._html_search_regex( -            r'toypics.net/([^/"]+)" class="user-name">', page, 'username') +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        formats = self._parse_html5_media_entries( +            url, webpage, video_id)[0]['formats'] +        title = self._html_search_regex([ +            r'<h1[^>]+class=["\']view-video-title[^>]+>([^<]+)</h', +            r'<title>([^<]+) - Toypics</title>', +        ], webpage, 'title') + +        uploader = self._html_search_regex( +            r'More videos from <strong>([^<]+)</strong>', webpage, 'uploader', +            fatal=False) +          return {              'id': video_id, -            'url': video_url, +            'formats': formats,              'title': title, -            'uploader': username, +            'uploader': uploader,              'age_limit': 18,          }  class ToypicsUserIE(InfoExtractor):      IE_DESC = 'Toypics user profile' -    _VALID_URL = r'https?://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' +    _VALID_URL = r'https?://videos\.toypics\.net/(?!view)(?P<id>[^/?#&]+)'      _TEST = {          'url': 'http://videos.toypics.net/Mikey',          'info_dict': { @@ -51,8 +57,7 @@ class ToypicsUserIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        username = mobj.group('username') +        username = self._match_id(url)          profile_page = self._download_webpage(              url, username, note='Retrieving profile page') @@ -71,7 +76,7 @@ class ToypicsUserIE(InfoExtractor):                  note='Downloading page %d/%d' % (n, page_count))              urls.extend(                  re.findall( -                    r'<p class="video-entry-title">\s+<a href="(https?://videos.toypics.net/view/[^"]+)">', +                    
r'<div[^>]+class=["\']preview[^>]+>\s*<a[^>]+href="(https?://videos\.toypics\.net/view/[^"]+)"',                      lpage))          return { diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 2aae55e7e..7421378a8 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -3,138 +3,6 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( -    ExtractorError, -    int_or_none, -    InAdvancePagedList, -    float_or_none, -    unescapeHTML, -) - - -class TudouIE(InfoExtractor): -    IE_NAME = 'tudou' -    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs|wlplay)/view|(?:listplay|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})' -    _TESTS = [{ -        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', -        'md5': '140a49ed444bd22f93330985d8475fcb', -        'info_dict': { -            'id': '159448201', -            'ext': 'f4v', -            'title': '卡马乔国足开大脚长传冲吊集锦', -            'thumbnail': r're:^https?://.*\.jpg$', -            'timestamp': 1372113489000, -            'description': '卡马乔卡家军,开大脚先进战术不完全集锦!', -            'duration': 289.04, -            'view_count': int, -            'filesize': int, -        } -    }, { -        'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/', -        'info_dict': { -            'id': '117049447', -            'ext': 'f4v', -            'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012', -            'thumbnail': r're:^https?://.*\.jpg$', -            'timestamp': 1349207518000, -            'description': 'md5:294612423894260f2dcd5c6c04fe248b', -            'duration': 5478.33, -            'view_count': int, -            'filesize': int, -        } -    }] - -    _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' - -    # Translated from tudou/tools/TVCHelper.as in PortalPlayer_193.swf -    # 0001, 0002 and 4001 are not included as they indicate temporary issues -    TVC_ERRORS = { -        '0003': 'The video is deleted or does not exist', -        '1001': 'This video is unavailable due to licensing issues', -        '1002': 'This video is unavailable as it\'s under review', -        '1003': 'This video is unavailable as it\'s under review', -        '3001': 'Password required', -        '5001': 'This video is available in Mainland China only due to licensing issues', -        '7001': 'This video is unavailable', -        '8001': 'This video is unavailable due to licensing issues', -    } - -    def _url_for_id(self, video_id, quality=None): -        info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id) -        if quality: -            info_url += '&hd' + quality -        xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page') -        error = xml_data.attrib.get('error') -        if error is not None: -            raise ExtractorError('Tudou said: %s' % error, expected=True) -        final_url = xml_data.text -        return final_url - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        item_data = self._download_json( -            'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id) - -        youku_vcode = item_data.get('vcode') -        if youku_vcode: -            return self.url_result('youku:' + youku_vcode, ie='Youku') - -        if not item_data.get('itemSegs'): -            tvc_code = item_data.get('tvcCode') -            if tvc_code: -         
       err_msg = self.TVC_ERRORS.get(tvc_code) -                if err_msg: -                    raise ExtractorError('Tudou said: %s' % err_msg, expected=True) -                raise ExtractorError('Unexpected error %s returned from Tudou' % tvc_code) -            raise ExtractorError('Unxpected error returned from Tudou') - -        title = unescapeHTML(item_data['kw']) -        description = item_data.get('desc') -        thumbnail_url = item_data.get('pic') -        view_count = int_or_none(item_data.get('playTimes')) -        timestamp = int_or_none(item_data.get('pt')) - -        segments = self._parse_json(item_data['itemSegs'], video_id) -        # It looks like the keys are the arguments that have to be passed as -        # the hd field in the request url, we pick the higher -        # Also, filter non-number qualities (see issue #3643). -        quality = sorted(filter(lambda k: k.isdigit(), segments.keys()), -                         key=lambda k: int(k))[-1] -        parts = segments[quality] -        len_parts = len(parts) -        if len_parts > 1: -            self.to_screen('%s: found %s parts' % (video_id, len_parts)) - -        def part_func(partnum): -            part = parts[partnum] -            part_id = part['k'] -            final_url = self._url_for_id(part_id, quality) -            ext = (final_url.split('?')[0]).split('.')[-1] -            return [{ -                'id': '%s' % part_id, -                'url': final_url, -                'ext': ext, -                'title': title, -                'thumbnail': thumbnail_url, -                'description': description, -                'view_count': view_count, -                'timestamp': timestamp, -                'duration': float_or_none(part.get('seconds'), 1000), -                'filesize': int_or_none(part.get('size')), -                'http_headers': { -                    'Referer': self._PLAYER_URL, -                }, -            }] - -        entries = InAdvancePagedList(part_func, len_parts, 1) - -        return { -            '_type': 'multi_video', -            'entries': entries, -            'id': video_id, -            'title': title, -        }  class TudouPlaylistIE(InfoExtractor): diff --git a/youtube_dl/extractor/turbo.py b/youtube_dl/extractor/turbo.py index 25aa9c58e..be3eaa5c2 100644 --- a/youtube_dl/extractor/turbo.py +++ b/youtube_dl/extractor/turbo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import (      ExtractorError,      int_or_none, @@ -49,7 +50,7 @@ class TurboIE(InfoExtractor):          for child in item:              m = re.search(r'url_video_(?P<quality>.+)', child.tag)              if m: -                quality = m.group('quality') +                quality = compat_str(m.group('quality'))                  formats.append({                      'format_id': quality,                      'url': child.text, diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 1c0be9fc6..efeb677ee 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -13,6 +13,7 @@ from ..utils import (      xpath_attr,      update_url_query,      ExtractorError, +    strip_or_none,  ) @@ -163,17 +164,21 @@ class TurnerBaseIE(AdobePassIE):              'height': int_or_none(image.get('height')),          } for image in video_data.findall('images/image')] +        is_live = xpath_text(video_data, 'isLive') == 'true' +          return {       
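The Turner return dict resuming below keys live detection off a plain text node in the feed XML and swaps in _live_title for live streams. A stdlib-only sketch of the same check (ElementTree in place of youtube_dl's xpath_text, a rough approximation of InfoExtractor._live_title, and a made-up feed snippet):

    import datetime
    import xml.etree.ElementTree as ET

    video_data = ET.fromstring(
        '<video><isLive>true</isLive><poster>http://example.com/p.jpg</poster></video>')
    is_live = video_data.findtext('isLive') == 'true'

    def live_title(name):
        # Approximates _live_title: stamp the start time into the title
        # so recordings of the same live channel stay distinguishable.
        return name + ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

    title = 'Breaking News'
    display_title = live_title(title) if is_live else title
    assert display_title != title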
       'id': video_id, -            'title': title, +            'title': self._live_title(title) if is_live else title,              'formats': formats,              'subtitles': subtitles,              'thumbnails': thumbnails, -            'description': xpath_text(video_data, 'description'), +            'thumbnail': xpath_text(video_data, 'poster'), +            'description': strip_or_none(xpath_text(video_data, 'description')),              'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')),              'timestamp': self._extract_timestamp(video_data),              'upload_date': xpath_attr(video_data, 'metas', 'version'),              'series': xpath_text(video_data, 'showTitle'),              'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),              'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), +            'is_live': is_live,          } diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 7aeb2c620..cfcce020a 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -18,7 +18,7 @@ class TV4IE(InfoExtractor):              tv4\.se/(?:[^/]+)/klipp/(?:.*)-|              tv4play\.se/              (?: -                (?:program|barn)/(?:[^\?]+)\?video_id=| +                (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)|                  iframe/video/|                  film/|                  sport/| @@ -63,6 +63,10 @@ class TV4IE(InfoExtractor):              'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412',              'only_matching': True,          }, +        { +            'url': 'http://www.tv4play.se/program/farang/3922081', +            'only_matching': True, +        }      ]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py index ebde6053f..8f8686a65 100644 --- a/youtube_dl/extractor/tvplayer.py +++ b/youtube_dl/extractor/tvplayer.py @@ -48,7 +48,7 @@ class TVPlayerIE(InfoExtractor):              'https://tvplayer.com/watch/context', display_id,              'Downloading JSON context', query={                  'resource': resource_id, -                'nonce': token, +                'gen': token,              })          validate = context['validate'] diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index 4fd1aa4bf..a42977f39 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -50,7 +50,7 @@ class TwentyMinutenIE(InfoExtractor):      @staticmethod      def _extract_urls(webpage):          return [m.group('url') for m in re.finditer( -            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1', +            r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1',              webpage)]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 2daf9dfac..c926c99a9 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -28,7 +28,7 @@ from ..utils import (  class TwitchBaseIE(InfoExtractor): -    _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv' +    _VALID_URL_BASE = r'https?://(?:(?:www|go)\.)?twitch\.tv'      _API_BASE = 'https://api.twitch.tv'      _USHER_BASE = 'https://usher.ttvnw.net' @@ -217,7 +217,7 @@ class TwitchVodIE(TwitchItemBaseIE):      
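The VOD pattern that follows (and the stream pattern further down) widen the host match from www.twitch.tv to go.twitch.tv as well. The alternation in isolation, checked against the go.twitch.tv URL from the added test plus two assumed variants:

    import re

    HOST = r'https?://(?:(?:www|go)\.)?twitch\.tv/'
    for url in ('https://www.twitch.tv/food',
                'https://go.twitch.tv/food',
                'https://twitch.tv/food'):
        assert re.match(HOST, url)
    # player.twitch.tv is deliberately not covered by this fragment;
    # the real patterns handle it through a separate alternative.
    assert not re.match(HOST, 'https://player.twitch.tv/?channel=food')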
_VALID_URL = r'''(?x)                      https?://                          (?: -                            (?:www\.)?twitch\.tv/(?:[^/]+/v|videos)/| +                            (?:(?:www|go)\.)?twitch\.tv/(?:[^/]+/v|videos)/|                              player\.twitch\.tv/\?.*?\bvideo=v                          )                          (?P<id>\d+) @@ -458,7 +458,7 @@ class TwitchStreamIE(TwitchBaseIE):      _VALID_URL = r'''(?x)                      https?://                          (?: -                            (?:www\.)?twitch\.tv/| +                            (?:(?:www|go)\.)?twitch\.tv/|                              player\.twitch\.tv/\?.*?\bchannel=                          )                          (?P<id>[^/#?]+) @@ -489,6 +489,9 @@ class TwitchStreamIE(TwitchBaseIE):      }, {          'url': 'https://player.twitch.tv/?channel=lotsofs',          'only_matching': True, +    }, { +        'url': 'https://go.twitch.tv/food', +        'only_matching': True,      }]      @classmethod diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 37e3bc412..6eaf360a6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -7,20 +7,38 @@ from .common import InfoExtractor  from ..compat import compat_urlparse  from ..utils import (      determine_ext, +    dict_get, +    ExtractorError,      float_or_none, -    xpath_text, -    remove_end,      int_or_none, -    ExtractorError, +    remove_end, +    try_get, +    xpath_text,  )  from .periscope import PeriscopeIE  class TwitterBaseIE(InfoExtractor): -    def _get_vmap_video_url(self, vmap_url, video_id): +    def _extract_formats_from_vmap_url(self, vmap_url, video_id):          vmap_data = self._download_xml(vmap_url, video_id) -        return xpath_text(vmap_data, './/MediaFile').strip() +        video_url = xpath_text(vmap_data, './/MediaFile').strip() +        if determine_ext(video_url) == 'm3u8': +            return self._extract_m3u8_formats( +                video_url, video_id, ext='mp4', m3u8_id='hls', +                entry_protocol='m3u8_native') +        return [{ +            'url': video_url, +        }] + +    @staticmethod +    def _search_dimensions_in_video_url(a_format, video_url): +        m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) +        if m: +            a_format.update({ +                'width': int(m.group('width')), +                'height': int(m.group('height')), +            })  class TwitterCardIE(TwitterBaseIE): @@ -36,7 +54,8 @@ class TwitterCardIE(TwitterBaseIE):                  'title': 'Twitter Card',                  'thumbnail': r're:^https?://.*\.jpg$',                  'duration': 30.033, -            } +            }, +            'skip': 'Video gone',          },          {              'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', @@ -48,6 +67,7 @@ class TwitterCardIE(TwitterBaseIE):                  'thumbnail': r're:^https?://.*\.jpg',                  'duration': 80.155,              }, +            'skip': 'Video gone',          },          {              'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', @@ -65,7 +85,7 @@ class TwitterCardIE(TwitterBaseIE):          },          {              'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', -            'md5': 'ab2745d0b0ce53319a534fccaa986439', +            'md5': '6dabeaca9e68cbb71c99c322a4b42a11',              'info_dict': {                  'id': 'iBb2x00UVlv',                  'ext': 'mp4', @@ -73,16 
+93,17 @@ class TwitterCardIE(TwitterBaseIE):                  'uploader_id': '1189339351084113920',                  'uploader': 'ArsenalTerje',                  'title': 'Vine by ArsenalTerje', +                'timestamp': 1447451307,              },              'add_ie': ['Vine'],          }, {              'url': 'https://twitter.com/i/videos/tweet/705235433198714880', -            'md5': '3846d0a07109b5ab622425449b59049d', +            'md5': '884812a2adc8aaf6fe52b15ccbfa3b88',              'info_dict': {                  'id': '705235433198714880',                  'ext': 'mp4',                  'title': 'Twitter web player', -                'thumbnail': r're:^https?://.*\.jpg', +                'thumbnail': r're:^https?://.*',              },          }, {              'url': 'https://twitter.com/i/videos/752274308186120192', @@ -90,6 +111,59 @@ class TwitterCardIE(TwitterBaseIE):          },      ] +    def _parse_media_info(self, media_info, video_id): +        formats = [] +        for media_variant in media_info.get('variants', []): +            media_url = media_variant['url'] +            if media_url.endswith('.m3u8'): +                formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) +            elif media_url.endswith('.mpd'): +                formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) +            else: +                vbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000) +                a_format = { +                    'url': media_url, +                    'format_id': 'http-%d' % vbr if vbr else 'http', +                    'vbr': vbr, +                } +                # Reported bitRate may be zero +                if not a_format['vbr']: +                    del a_format['vbr'] + +                self._search_dimensions_in_video_url(a_format, media_url) + +                formats.append(a_format) +        return formats + +    def _extract_mobile_formats(self, username, video_id): +        webpage = self._download_webpage( +            'https://mobile.twitter.com/%s/status/%s' % (username, video_id), +            video_id, 'Downloading mobile webpage', +            headers={ +                # A recent mobile UA is necessary for `gt` cookie +                'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0', +            }) +        main_script_url = self._html_search_regex( +            r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage, 'main script URL') +        main_script = self._download_webpage( +            main_script_url, video_id, 'Downloading main script') +        bearer_token = self._search_regex( +            r'BEARER_TOKEN\s*:\s*"([^"]+)"', +            main_script, 'bearer token') +        guest_token = self._search_regex( +            r'document\.cookie\s*=\s*decodeURIComponent\("gt=(\d+)', +            webpage, 'guest token') +        api_data = self._download_json( +            'https://api.twitter.com/2/timeline/conversation/%s.json' % video_id, +            video_id, 'Downloading mobile API data', +            headers={ +                'Authorization': 'Bearer ' + bearer_token, +                'x-guest-token': guest_token, +            }) +        media_info = try_get(api_data, lambda o: o['globalObjects']['tweets'][video_id] +                                                  ['extended_entities']['media'][0]['video_info']) or {} +        return self._parse_media_info(media_info, video_id) +      def 
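The new _extract_mobile_formats above has to scrape three values before it may call the JSON endpoint: the main script URL, the bearer token inside that script, and the per-session guest token. The same handshake replayed offline against canned strings (every value below is fabricated, and the live endpoint may well have changed since):

    import re

    webpage = (
        '<script src="https://abs.twimg.com/rweb/main.deadbeef.js"></script>'
        '<script>document.cookie = decodeURIComponent("gt=123456789");</script>')
    main_script = 'const cfg = {BEARER_TOKEN:"AAAAexampletoken"};'

    main_script_url = re.search(
        r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage).group(1)
    bearer_token = re.search(
        r'BEARER_TOKEN\s*:\s*"([^"]+)"', main_script).group(1)
    guest_token = re.search(
        r'document\.cookie\s*=\s*decodeURIComponent\("gt=(\d+)', webpage).group(1)

    # Both tokens then ride along as headers on the API request.
    headers = {
        'Authorization': 'Bearer ' + bearer_token,
        'x-guest-token': guest_token,
    }
    assert headers['x-guest-token'] == '123456789'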
_real_extract(self, url):          video_id = self._match_id(url) @@ -117,14 +191,6 @@ class TwitterCardIE(TwitterBaseIE):          if periscope_url:              return self.url_result(periscope_url, PeriscopeIE.ie_key()) -        def _search_dimensions_in_video_url(a_format, video_url): -            m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) -            if m: -                a_format.update({ -                    'width': int(m.group('width')), -                    'height': int(m.group('height')), -                }) -          video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source')          if video_url: @@ -135,15 +201,14 @@ class TwitterCardIE(TwitterBaseIE):                      'url': video_url,                  } -                _search_dimensions_in_video_url(f, video_url) +                self._search_dimensions_in_video_url(f, video_url)                  formats.append(f)          vmap_url = config.get('vmapUrl') or config.get('vmap_url')          if vmap_url: -            formats.append({ -                'url': self._get_vmap_video_url(vmap_url, video_id), -            }) +            formats.extend( +                self._extract_formats_from_vmap_url(vmap_url, video_id))          media_info = None @@ -152,29 +217,14 @@ class TwitterCardIE(TwitterBaseIE):                  media_info = entity['mediaInfo']          if media_info: -            for media_variant in media_info['variants']: -                media_url = media_variant['url'] -                if media_url.endswith('.m3u8'): -                    formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) -                elif media_url.endswith('.mpd'): -                    formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) -                else: -                    vbr = int_or_none(media_variant.get('bitRate'), scale=1000) -                    a_format = { -                        'url': media_url, -                        'format_id': 'http-%d' % vbr if vbr else 'http', -                        'vbr': vbr, -                    } -                    # Reported bitRate may be zero -                    if not a_format['vbr']: -                        del a_format['vbr'] - -                    _search_dimensions_in_video_url(a_format, media_url) - -                    formats.append(a_format) - +            formats.extend(self._parse_media_info(media_info, video_id))              duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) +        username = config.get('user', {}).get('screen_name') +        if username: +            formats.extend(self._extract_mobile_formats(username, video_id)) + +        self._remove_duplicate_formats(formats)          self._sort_formats(formats)          title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title') @@ -255,10 +305,10 @@ class TwitterIE(InfoExtractor):          'info_dict': {              'id': '700207533655363584',              'ext': 'mp4', -            'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel', -            'description': 'JG on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', +            'title': 'Donte - BEAT PROD: @suhmeduh #Damndaniel', +            'description': 'Donte on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',              'thumbnail': r're:^https?://.*\.jpg', -            'uploader': 'JG', +            
'uploader': 'Donte',              'uploader_id': 'jaydingeer',          },          'params': { @@ -270,9 +320,11 @@ class TwitterIE(InfoExtractor):          'info_dict': {              'id': 'MIOxnrUteUd',              'ext': 'mp4', -            'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', -            'uploader': 'TAKUMA', -            'uploader_id': '1004126642786242560', +            'title': 'FilmDrunk - Vine of the day', +            'description': 'FilmDrunk on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', +            'uploader': 'FilmDrunk', +            'uploader_id': 'Filmdrunk', +            'timestamp': 1402826626,              'upload_date': '20140615',          },          'add_ie': ['Vine'], @@ -294,13 +346,28 @@ class TwitterIE(InfoExtractor):          'info_dict': {              'id': '1zqKVVlkqLaKB',              'ext': 'mp4', -            'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence', +            'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence', +            'description': 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence  https://t.co/EKrVgIXF3s"',              'upload_date': '20160923',              'uploader_id': 'OPP_HSD', -            'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', +            'uploader': 'Sgt Kerry Schmidt',              'timestamp': 1474613214,          },          'add_ie': ['Periscope'], +    }, { +        # has mp4 formats via mobile API +        'url': 'https://twitter.com/news_al3alm/status/852138619213144067', +        'info_dict': { +            'id': '852138619213144067', +            'ext': 'mp4', +            'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', +            'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. 
الظلم ظلمات يوم القيامة   https://t.co/xg6OhpyKfN"', +            'uploader': 'عالم الأخبار', +            'uploader_id': 'news_al3alm', +        }, +        'params': { +            'format': 'best[format_id^=http-]', +        },      }]      def _real_extract(self, url): @@ -393,7 +460,7 @@ class TwitterAmplifyIE(TwitterBaseIE):          vmap_url = self._html_search_meta(              'twitter:amplify:vmap', webpage, 'vmap url') -        video_url = self._get_vmap_video_url(vmap_url, video_id) +        formats = self._extract_formats_from_vmap_url(vmap_url, video_id)          thumbnails = []          thumbnail = self._html_search_meta( @@ -415,11 +482,10 @@ class TwitterAmplifyIE(TwitterBaseIE):              })          video_w, video_h = _find_dimension('player') -        formats = [{ -            'url': video_url, +        formats[0].update({              'width': video_w,              'height': video_h, -        }] +        })          return {              'id': video_id, diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index dae1aa3c6..207c4a6a7 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -15,6 +15,7 @@ from ..utils import (      ExtractorError,      float_or_none,      int_or_none, +    js_to_json,      sanitized_Request,      unescapeHTML,      urlencode_postdata, @@ -52,6 +53,10 @@ class UdemyIE(InfoExtractor):          # new URL schema          'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906',          'only_matching': True, +    }, { +        # no url in outputs format entry +        'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812', +        'only_matching': True,      }]      def _extract_course_info(self, webpage, video_id): @@ -69,7 +74,7 @@ class UdemyIE(InfoExtractor):              return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url          checkout_url = unescapeHTML(self._search_regex( -            r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1', +            r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1',              webpage, 'checkout url', group='url', default=None))          if checkout_url:              raise ExtractorError( @@ -219,7 +224,7 @@ class UdemyIE(InfoExtractor):          def extract_output_format(src, f_id):              return { -                'url': src['url'], +                'url': src.get('url'),                  'format_id': '%sp' % (src.get('height') or f_id),                  'width': int_or_none(src.get('width')),                  'height': int_or_none(src.get('height')), @@ -264,6 +269,25 @@ class UdemyIE(InfoExtractor):                      f = add_output_format_meta(f, format_id)                  formats.append(f) +        def extract_subtitles(track_list): +            if not isinstance(track_list, list): +                return +            for track in track_list: +                if not isinstance(track, dict): +                    continue +                if track.get('kind') != 'captions': +                    continue +                src = track.get('src') +                if not src or not isinstance(src, compat_str): +                    continue +                lang = track.get('language') or track.get( +                    'srclang') or track.get('label') +                sub_dict = automatic_captions if track.get( +                    
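The extract_subtitles helper being factored out in the Udemy hunk here routes each track into either the manual or the auto-generated bucket. The same mapping as a standalone snippet (the track data is invented):

    subtitles, automatic_captions = {}, {}
    tracks = [
        {'kind': 'captions', 'src': 'https://example.com/en.vtt', 'language': 'en'},
        {'kind': 'captions', 'src': 'https://example.com/de.vtt', 'srclang': 'de',
         'autogenerated': True},
        {'kind': 'chapters', 'src': 'https://example.com/ch.json'},
    ]
    for track in tracks:
        if not isinstance(track, dict) or track.get('kind') != 'captions':
            continue
        src = track.get('src')
        if not src or not isinstance(src, str):  # compat_str in the real code
            continue
        lang = track.get('language') or track.get('srclang') or track.get('label')
        dest = automatic_captions if track.get('autogenerated') is True else subtitles
        dest.setdefault(lang, []).append({'url': src})
    assert 'en' in subtitles and 'de' in automatic_captions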
'autogenerated') is True else subtitles +                sub_dict.setdefault(lang, []).append({ +                    'url': src, +                }) +          download_urls = asset.get('download_urls')          if isinstance(download_urls, dict):              extract_formats(download_urls.get('Video')) @@ -311,23 +335,16 @@ class UdemyIE(InfoExtractor):                  extract_formats(data.get('sources'))                  if not duration:                      duration = int_or_none(data.get('duration')) -                tracks = data.get('tracks') -                if isinstance(tracks, list): -                    for track in tracks: -                        if not isinstance(track, dict): -                            continue -                        if track.get('kind') != 'captions': -                            continue -                        src = track.get('src') -                        if not src or not isinstance(src, compat_str): -                            continue -                        lang = track.get('language') or track.get( -                            'srclang') or track.get('label') -                        sub_dict = automatic_captions if track.get( -                            'autogenerated') is True else subtitles -                        sub_dict.setdefault(lang, []).append({ -                            'url': src, -                        }) +                extract_subtitles(data.get('tracks')) + +            if not subtitles and not automatic_captions: +                text_tracks = self._parse_json( +                    self._search_regex( +                        r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html, +                        'text tracks', default='{}', group='data'), video_id, +                    transform_source=lambda s: js_to_json(unescapeHTML(s)), +                    fatal=False) +                extract_subtitles(text_tracks)          self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/upskill.py new file mode 100644 index 000000000..30297b4dd --- /dev/null +++ b/youtube_dl/extractor/upskill.py @@ -0,0 +1,176 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .wistia import WistiaIE +from ..compat import compat_str +from ..utils import ( +    clean_html, +    ExtractorError, +    get_element_by_class, +    urlencode_postdata, +    urljoin, +) + + +class UpskillBaseIE(InfoExtractor): +    _LOGIN_URL = 'http://upskillcourses.com/sign_in' +    _NETRC_MACHINE = 'upskill' + +    def _real_initialize(self): +        self._login() + +    def _login(self): +        username, password = self._get_login_info() +        if username is None: +            return + +        login_page, urlh = self._download_webpage_handle( +            self._LOGIN_URL, None, 'Downloading login page') + +        login_url = compat_str(urlh.geturl()) + +        login_form = self._hidden_inputs(login_page) + +        login_form.update({ +            'user[email]': username, +            'user[password]': password, +        }) + +        post_url = self._search_regex( +            r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page, +            'post url', default=login_url, group='url') + +        if not post_url.startswith('http'): +            post_url = urljoin(login_url, post_url) + +        response = self._download_webpage( +            post_url, None, 'Logging in', +            
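The Upskill login under construction here echoes the form's hidden fields (CSRF token and the like) back together with the credentials. A rough standalone version of that flow, with a toy hidden_inputs in place of InfoExtractor._hidden_inputs and a canned login page:

    import re
    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode  # Python 2

    def hidden_inputs(html):
        # Collect <input type="hidden"> name/value pairs so the POST
        # echoes server-issued tokens back unchanged.
        form = {}
        for inp in re.findall(r'<input[^>]+>', html):
            if 'type="hidden"' not in inp:
                continue
            name = re.search(r'name="([^"]+)"', inp)
            value = re.search(r'value="([^"]*)"', inp)
            if name:
                form[name.group(1)] = value.group(1) if value else ''
        return form

    login_page = ('<form><input type="hidden" name="authenticity_token" '
                  'value="abc123"><input type="text" name="user[email]"></form>')
    form = hidden_inputs(login_page)
    form.update({'user[email]': 'me@example.com', 'user[password]': 'hunter2'})
    data = urlencode(form).encode('utf-8')  # request body for the POST
    assert b'authenticity_token=abc123' in data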
data=urlencode_postdata(login_form), +            headers={ +                'Content-Type': 'application/x-www-form-urlencoded', +                'Referer': login_url, +            }) + +        # Successful login +        if any(re.search(p, response) for p in ( +                r'class=["\']user-signout', +                r'<a[^>]+\bhref=["\']/sign_out', +                r'>\s*Log out\s*<')): +            return + +        message = get_element_by_class('alert', response) +        if message is not None: +            raise ExtractorError( +                'Unable to login: %s' % clean_html(message), expected=True) + +        raise ExtractorError('Unable to log in') + + +class UpskillIE(UpskillBaseIE): +    _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P<id>\d+)' + +    _TESTS = [{ +        'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', +        'info_dict': { +            'id': 'uzw6zw58or', +            'ext': 'mp4', +            'title': 'Welcome to the Course!', +            'description': 'md5:8d66c13403783370af62ca97a7357bdd', +            'duration': 138.763, +            'timestamp': 1479846621, +            'upload_date': '20161122', +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        wistia_url = WistiaIE._extract_url(webpage) +        if not wistia_url: +            if any(re.search(p, webpage) for p in ( +                    r'class=["\']lecture-contents-locked', +                    r'>\s*Lecture contents locked', +                    r'id=["\']lecture-locked')): +                self.raise_login_required('Lecture contents locked') + +        title = self._og_search_title(webpage, default=None) + +        return { +            '_type': 'url_transparent', +            'url': wistia_url, +            'ie_key': WistiaIE.ie_key(), +            'title': title, +        } + + +class UpskillCourseIE(UpskillBaseIE): +    _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P<id>[^/?#&]+)' +    _TESTS = [{ +        'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', +        'info_dict': { +            'id': '119763', +            'title': 'The Essential Web Developer Course (Free)', +        }, +        'playlist_count': 192, +    }, { +        'url': 'http://upskillcourses.com/courses/119763/', +        'only_matching': True, +    }, { +        'url': 'http://upskillcourses.com/courses/enrolled/119763', +        'only_matching': True, +    }] + +    @classmethod +    def suitable(cls, url): +        return False if UpskillIE.suitable(url) else super( +            UpskillCourseIE, cls).suitable(url) + +    def _real_extract(self, url): +        course_id = self._match_id(url) + +        webpage = self._download_webpage(url, course_id) + +        course_id = self._search_regex( +            r'data-course-id=["\'](\d+)', webpage, 'course id', +            default=course_id) + +        entries = [] + +        for mobj in re.finditer( +                r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)', +                webpage): +            li = mobj.group('li') +            if 'fa-youtube-play' not in li: +                continue +       
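UpskillCourseIE.suitable above steps aside whenever the per-lecture extractor already claims the URL, so course pages and lecture pages never race each other. The pattern boiled down to a toy pair of classes (names and regexes are illustrative, not the real ones):

    import re

    class LectureIE(object):
        _VALID_URL = r'https?://example\.com/courses/[^/]+/lectures/\d+'

        @classmethod
        def suitable(cls, url):
            return re.match(cls._VALID_URL, url) is not None

    class CourseIE(object):
        _VALID_URL = r'https?://example\.com/courses/[^/?#&]+'

        @classmethod
        def suitable(cls, url):
            # Let the per-lecture extractor win for lecture URLs.
            return (re.match(cls._VALID_URL, url) is not None
                    and not LectureIE.suitable(url))

    assert LectureIE.suitable('https://example.com/courses/x/lectures/1')
    assert not CourseIE.suitable('https://example.com/courses/x/lectures/1')
    assert CourseIE.suitable('https://example.com/courses/x')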
     lecture_url = self._search_regex( +                r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li, +                'lecture url', default=None, group='url') +            if not lecture_url: +                continue +            lecture_id = self._search_regex( +                r'/lectures/(\d+)', lecture_url, 'lecture id', default=None) +            title = self._html_search_regex( +                r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li, +                'title', default=None) +            entries.append( +                self.url_result( +                    urljoin('http://upskillcourses.com/', lecture_url), +                    ie=UpskillIE.ie_key(), video_id=lecture_id, +                    video_title=clean_html(title))) + +        course_title = self._html_search_regex( +            (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h', +             r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'), +            webpage, 'course title', fatal=False) + +        return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index 0f5d68738..b20dddc5c 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -12,47 +12,46 @@ from ..utils import (  class VeohIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)' +    _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)' -    _TESTS = [ -        { -            'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', -            'md5': '620e68e6a3cff80086df3348426c9ca3', -            'info_dict': { -                'id': '56314296', -                'ext': 'mp4', -                'title': 'Straight Backs Are Stronger', -                'uploader': 'LUMOback', -                'description': 'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. ', -            }, +    _TESTS = [{ +        'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', +        'md5': '620e68e6a3cff80086df3348426c9ca3', +        'info_dict': { +            'id': '56314296', +            'ext': 'mp4', +            'title': 'Straight Backs Are Stronger', +            'uploader': 'LUMOback', +            'description': 'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. 
',          }, -        { -            'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', -            'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', -            'info_dict': { -                'id': '27701988', -                'ext': 'mp4', -                'title': 'Chile workers cover up to avoid skin damage', -                'description': 'md5:2bd151625a60a32822873efc246ba20d', -                'uploader': 'afp-news', -                'duration': 123, -            }, -            'skip': 'This video has been deleted.', +    }, { +        'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', +        'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', +        'info_dict': { +            'id': '27701988', +            'ext': 'mp4', +            'title': 'Chile workers cover up to avoid skin damage', +            'description': 'md5:2bd151625a60a32822873efc246ba20d', +            'uploader': 'afp-news', +            'duration': 123,          }, -        { -            'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', -            'md5': '4fde7b9e33577bab2f2f8f260e30e979', -            'note': 'Embedded ooyala video', -            'info_dict': { -                'id': '69525809', -                'ext': 'mp4', -                'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', -                'description': 'md5:f5a11c51f8fb51d2315bca0937526891', -                'uploader': 'newsy-videos', -            }, -            'skip': 'This video has been deleted.', +        'skip': 'This video has been deleted.', +    }, { +        'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', +        'md5': '4fde7b9e33577bab2f2f8f260e30e979', +        'note': 'Embedded ooyala video', +        'info_dict': { +            'id': '69525809', +            'ext': 'mp4', +            'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', +            'description': 'md5:f5a11c51f8fb51d2315bca0937526891', +            'uploader': 'newsy-videos',          }, -    ] +        'skip': 'This video has been deleted.', +    }, { +        'url': 'http://www.veoh.com/watch/e152215AJxZktGS', +        'only_matching': True, +    }]      def _extract_formats(self, source):          formats = [] diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 0f8c156a7..c21a09c01 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -42,7 +42,7 @@ class VGTVIE(XstreamIE):                      )                      /?                      
(?: -                        \#!/(?:video|live)/| +                        (?:\#!/)?(?:video|live)/|                          embed?.*id=|                          articles/                      )| @@ -146,7 +146,11 @@ class VGTVIE(XstreamIE):          {              'url': 'abtv:140026',              'only_matching': True, -        } +        }, +        { +            'url': 'http://www.vgtv.no/video/84196/hevnen-er-soet-episode-10-abu', +            'only_matching': True, +        },      ]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py index 6be3774b7..570fa45ea 100644 --- a/youtube_dl/extractor/vh1.py +++ b/youtube_dl/extractor/vh1.py @@ -121,7 +121,11 @@ class VH1IE(MTVIE):          idoc = self._download_xml(              doc_url, video_id,              'Downloading info', transform_source=fix_xml_ampersands) -        return self.playlist_result( -            [self._get_video_info(item) for item in idoc.findall('.//item')], -            playlist_id=video_id, -        ) + +        entries = [] +        for item in idoc.findall('.//item'): +            info = self._get_video_info(item) +            if info: +                entries.append(info) + +        return self.playlist_result(entries, playlist_id=video_id) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index f0a7fd739..b8b8bf979 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -7,6 +7,7 @@ import hashlib  import json  from .adobepass import AdobePassIE +from .youtube import YoutubeIE  from .common import InfoExtractor  from ..compat import compat_HTTPError  from ..utils import ( @@ -20,7 +21,7 @@ from ..utils import (  class ViceBaseIE(AdobePassIE): -    def _extract_preplay_video(self, url, webpage): +    def _extract_preplay_video(self, url, locale, webpage):          watch_hub_data = extract_attributes(self._search_regex(              r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub'))          video_id = watch_hub_data['vms-id'] @@ -32,7 +33,8 @@ class ViceBaseIE(AdobePassIE):              resource = self._get_mvpd_resource(                  'VICELAND', title, video_id,                  watch_hub_data.get('video-rating')) -            query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource) +            query['tvetoken'] = self._extract_mvpd_auth( +                url, video_id, 'VICELAND', resource)          # signature generation algorithm is reverse engineered from signatureGenerator in          # webpack:///../shared/~/vice-player/dist/js/vice-player.js in @@ -45,11 +47,14 @@ class ViceBaseIE(AdobePassIE):          try:              host = 'www.viceland' if is_locked else self._PREPLAY_HOST -            preplay = self._download_json('https://%s.com/en_us/preplay/%s' % (host, video_id), video_id, query=query) +            preplay = self._download_json( +                'https://%s.com/%s/preplay/%s' % (host, locale, video_id), +                video_id, query=query)          except ExtractorError as e:              if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:                  error = json.loads(e.cause.read().decode()) -                raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True) +                raise ExtractorError('%s said: %s' % ( +                    self.IE_NAME, error['details']), expected=True)              raise          video_data = preplay['video'] @@ -88,41 +93,30 @@ class ViceBaseIE(AdobePassIE):  
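The reworked ViceIE below threads a locale group through the URL pattern so _extract_preplay_video can build 'https://%s.com/%s/preplay/%s' per region. Matching the new pattern against one of the updated test URLs shows what each group captures (note the id group grabs the path slug; the canonical id still comes from the preplay JSON):

    import re

    _VALID_URL = (r'https?://(?:.+?\.)?vice\.com/'
                  r'(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)')
    url = ('https://video.vice.com/en_us/video/'
           'the-signal-from-tolva/5816510690b70e6c5fd39a56')
    locale, video_id = re.match(_VALID_URL, url).groups()
    assert locale == 'en_us'
    assert video_id == 'the-signal-from-tolva'  # slug, not the hex id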
class ViceIE(ViceBaseIE): -    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)' +    IE_NAME = 'vice' +    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)'      _TESTS = [{ -        'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', -        'md5': 'e9d77741f9e42ba583e683cd170660f7', +        'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', +        'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2',          'info_dict': { -            'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', +            'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj',              'ext': 'flv', -            'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', -            'duration': 725.983, +            'title': 'Monkey Labs of Holland', +            'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149',          },          'add_ie': ['Ooyala'],      }, { -        'url': 'http://www.vice.com/video/how-to-hack-a-car', -        'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', -        'info_dict': { -            'id': '3jstaBeXgAs', -            'ext': 'mp4', -            'title': 'How to Hack a Car: Phreaked Out (Episode 2)', -            'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', -            'uploader_id': 'MotherboardTV', -            'uploader': 'Motherboard', -            'upload_date': '20140529', -        }, -        'add_ie': ['Youtube'], -    }, {          'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', -        'md5': '',          'info_dict': {              'id': '5816510690b70e6c5fd39a56',              'ext': 'mp4',              'uploader': 'Waypoint',              'title': 'The Signal From Tölva', +            'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',              'uploader_id': '57f7d621e05ca860fa9ccaf9', -            'timestamp': 1477941983938, +            'timestamp': 1477941983, +            'upload_date': '20161031',          },          'params': {              # m3u8 download @@ -130,19 +124,31 @@ class ViceIE(ViceBaseIE):          },          'add_ie': ['UplynkPreplay'],      }, { -        'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', -        'only_matching': True, -    }, { -        'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229', -        'only_matching': True, +        'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', +        'info_dict': { +            'id': '581b12b60a0e1f4c0fb6ea2f', +            'ext': 'mp4', +            'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', +            'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>', +            'uploader': 'VICE', +            'uploader_id': '57a204088cb727dec794c67b', +            'timestamp': 1485368119, +            'upload_date': '20170125', +            'age_limit': 14, +        }, +        'params': { +            # AES-encrypted m3u8 +            'skip_download': True, +        }, +        'add_ie': ['UplynkPreplay'],      }, { -        'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show', +        'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',          'only_matching': True,      }]      _PREPLAY_HOST = 'video.vice'      def 
_real_extract(self, url): -        video_id = self._match_id(url) +        locale, video_id = re.match(self._VALID_URL, url).groups()          webpage, urlh = self._download_webpage_handle(url, video_id)          embed_code = self._search_regex(              r'embedCode=([^&\'"]+)', webpage, @@ -153,10 +159,11 @@ class ViceIE(ViceBaseIE):              r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None)          if youtube_id:              return self.url_result(youtube_id, 'Youtube') -        return self._extract_preplay_video(urlh.geturl(), webpage) +        return self._extract_preplay_video(urlh.geturl(), locale, webpage)  class ViceShowIE(InfoExtractor): +    IE_NAME = 'vice:show'      _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'      _TEST = { @@ -183,6 +190,84 @@ class ViceShowIE(InfoExtractor):              r'<title>(.+?)</title>', webpage, 'title', default=None)          if title:              title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() -        description = self._html_search_meta('description', webpage, 'description') +        description = self._html_search_meta( +            'description', webpage, 'description')          return self.playlist_result(entries, show_id, title, description) + + +class ViceArticleIE(InfoExtractor): +    IE_NAME = 'vice:article' +    _VALID_URL = r'https://www.vice.com/[^/]+/article/(?P<id>[^?#]+)' + +    _TESTS = [{ +        'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', +        'info_dict': { +            'id': '58dc0a3dee202d2a0ccfcbd8', +            'ext': 'mp4', +            'title': 'Mormon War on Porn ', +            'description': 'md5:ad396a2481e7f8afb5ed486878421090', +            'uploader': 'VICE', +            'uploader_id': '57a204088cb727dec794c693', +            'timestamp': 1489160690, +            'upload_date': '20170310', +        }, +        'params': { +            # AES-encrypted m3u8 +            'skip_download': True, +        }, +        'add_ie': ['UplynkPreplay'], +    }, { +        'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', +        'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', +        'info_dict': { +            'id': '3jstaBeXgAs', +            'ext': 'mp4', +            'title': 'How to Hack a Car: Phreaked Out (Episode 2)', +            'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', +            'uploader_id': 'MotherboardTV', +            'uploader': 'Motherboard', +            'upload_date': '20140529', +        }, +        'add_ie': ['Youtube'], +    }, { +        'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', +        'only_matching': True, +    }, { +        'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        prefetch_data = self._parse_json(self._search_regex( +            r'window\.__PREFETCH_DATA\s*=\s*({.*});', +            webpage, 'prefetch data'), display_id) +        body = prefetch_data['body'] + +        def _url_res(video_url, ie_key): +            return { +                '_type': 'url_transparent', +                'url': video_url, +                'display_id': display_id, +                'ie_key': ie_key, +            } + +        embed_code = self._search_regex( +            r'embedCode=([^&\'"]+)', body, +            
'ooyala embed code', default=None) +        if embed_code: +            return _url_res('ooyala:%s' % embed_code, 'Ooyala') + +        youtube_url = YoutubeIE._extract_url(body) +        if youtube_url: +            return _url_res(youtube_url, YoutubeIE.ie_key()) + +        video_url = self._html_search_regex( +            r'data-video-url="([^"]+)"', +            prefetch_data['embed_code'], 'video URL') + +        return _url_res(video_url, ViceIE.ie_key()) diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index 87f9216b5..bd60235c8 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -1,11 +1,13 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .vice import ViceBaseIE  class VicelandIE(ViceBaseIE): -    _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)' +    _VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)'      _TEST = {          'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316',          'info_dict': { @@ -24,10 +26,13 @@ class VicelandIE(ViceBaseIE):              'skip_download': True,          },          'add_ie': ['UplynkPreplay'], +        'skip': '404',      }      _PREPLAY_HOST = 'www.viceland'      def _real_extract(self, url): -        video_id = self._match_id(url) +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        locale = mobj.group('locale')          webpage = self._download_webpage(url, video_id) -        return self._extract_preplay_video(url, webpage) +        return self._extract_preplay_video(url, locale, webpage) diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index 701bb1d01..01da32f1c 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -56,7 +56,8 @@ class VidioIE(InfoExtractor):          self._sort_formats(formats)          duration = int_or_none(duration or self._search_regex( -            r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration')) +            r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage, +            'duration', fatal=False, group='duration'))          thumbnail = thumbnail or self._og_search_thumbnail(webpage)          like_count = int_or_none(self._search_regex( diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index e9ff336c4..59adb2377 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals  import itertools  from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( +    compat_HTTPError, +    compat_str, +)  from ..utils import (      ExtractorError,      int_or_none, @@ -161,13 +164,28 @@ class VidmeIE(InfoExtractor):                  'or for violating the terms of use.',                  expected=True) -        formats = [{ -            'format_id': f.get('type'), -            'url': f['uri'], -            'width': int_or_none(f.get('width')), -            'height': int_or_none(f.get('height')), -            'preference': 0 if f.get('type', '').endswith('clip') else 1, -        } for f in video.get('formats', []) if f.get('uri')] +        formats = [] +        for f in video.get('formats', []): +            format_url = f.get('uri') +            if not format_url or not isinstance(format_url, compat_str): +                continue +            format_type = 
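    # The branch being built in the vidme loop here dispatches on the
    # declared format type; reduced to a standalone toy below, with the
    # manifest expanders (_extract_mpd_formats / _extract_m3u8_formats)
    # replaced by labels so the sketch runs on its own.
    def classify(fmt):
        format_type = fmt.get('type')
        if format_type == 'dash':
            return 'expand MPD manifest'
        if format_type == 'hls':
            return 'expand m3u8 manifest'
        return 'single progressive URL'

    assert classify({'type': 'dash'}) == 'expand MPD manifest'
    assert classify({'type': 'hls'}) == 'expand m3u8 manifest'
    assert classify({'type': '720p clip'}) == 'single progressive URL'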
f.get('type') +            if format_type == 'dash': +                formats.extend(self._extract_mpd_formats( +                    format_url, video_id, mpd_id='dash', fatal=False)) +            elif format_type == 'hls': +                formats.extend(self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id='hls', fatal=False)) +            else: +                formats.append({ +                    'format_id': f.get('type'), +                    'url': format_url, +                    'width': int_or_none(f.get('width')), +                    'height': int_or_none(f.get('height')), +                    'preference': 0 if f.get('type', '').endswith( +                        'clip') else 1, +                })          if not formats and video.get('complete_url'):              formats.append({ @@ -245,29 +263,35 @@ class VidmeListBaseIE(InfoExtractor):  class VidmeUserIE(VidmeListBaseIE):      IE_NAME = 'vidme:user' -    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})(?!/likes)(?:[^\da-zA-Z]|$)' +    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})(?!/likes)(?:[^\da-zA-Z_-]|$)'      _API_ITEM = 'list'      _TITLE = 'Videos' -    _TEST = { -        'url': 'https://vid.me/EFARCHIVE', +    _TESTS = [{ +        'url': 'https://vid.me/MasakoX',          'info_dict': { -            'id': '3834632', -            'title': 'EFARCHIVE - %s' % _TITLE, +            'id': '16112341', +            'title': 'MasakoX - %s' % _TITLE,          }, -        'playlist_mincount': 238, -    } +        'playlist_mincount': 191, +    }, { +        'url': 'https://vid.me/unsQuare_netWork', +        'only_matching': True, +    }]  class VidmeUserLikesIE(VidmeListBaseIE):      IE_NAME = 'vidme:user:likes' -    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})/likes' +    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})/likes'      _API_ITEM = 'likes'      _TITLE = 'Likes' -    _TEST = { +    _TESTS = [{          'url': 'https://vid.me/ErinAlexis/likes',          'info_dict': {              'id': '6483530',              'title': 'ErinAlexis - %s' % _TITLE,          },          'playlist_mincount': 415, -    } +    }, { +        'url': 'https://vid.me/Kaleidoscope-Ish/likes', +        'only_matching': True, +    }] diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 5ef7635b6..dbd5ba9ba 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -5,24 +5,44 @@ import re  import itertools  from .common import InfoExtractor +from ..utils import ( +    urlencode_postdata, +    int_or_none, +    unified_strdate, +)  class VierIE(InfoExtractor):      IE_NAME = 'vier'      IE_DESC = 'vier.be and vijf.be' -    _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' +    _VALID_URL = r'''(?x) +                    https?:// +                        (?:www\.)?(?P<site>vier|vijf)\.be/ +                        (?: +                            (?: +                                [^/]+/videos| +                                video(?:/[^/]+)* +                            )/ +                            (?P<display_id>[^/]+)(?:/(?P<id>\d+))?| +                            (?: +                                video/v3/embed| +                                embed/video/public +                            )/(?P<embed_id>\d+) +                        ) +         
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 5ef7635b6..dbd5ba9ba 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -5,24 +5,44 @@ import re  import itertools  from .common import InfoExtractor +from ..utils import ( +    urlencode_postdata, +    int_or_none, +    unified_strdate, +)  class VierIE(InfoExtractor):      IE_NAME = 'vier'      IE_DESC = 'vier.be and vijf.be' -    _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' +    _VALID_URL = r'''(?x) +                    https?:// +                        (?:www\.)?(?P<site>vier|vijf)\.be/ +                        (?: +                            (?: +                                [^/]+/videos| +                                video(?:/[^/]+)* +                            )/ +                            (?P<display_id>[^/]+)(?:/(?P<id>\d+))?| +                            (?: +                                video/v3/embed| +                                embed/video/public +                            )/(?P<embed_id>\d+) +                        ) +                    ''' +    _NETRC_MACHINE = 'vier'  _TESTS = [{          'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', +        'md5': 'e4ae2054a6b040ef1e289e20d111b46e',          'info_dict': {              'id': '16129',              'display_id': 'het-wordt-warm-de-moestuin',              'ext': 'mp4',              'title': 'Het wordt warm in De Moestuin',              'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', -        }, -        'params': { -            # m3u8 download -            'skip_download': True, +            'upload_date': '20121025', +            'series': 'Plan B', +            'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'],          },      }, {          'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', @@ -30,46 +50,145 @@ class VierIE(InfoExtractor):              'id': '2561614',              'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',              'ext': 'mp4', -            'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s', -            'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.', +            'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', +            'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', +            'upload_date': '20170228', +            'series': 'Temptation Island', +            'tags': list, +        }, +        'params': { +            'skip_download': True, +        }, +    }, { +        'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', +        'info_dict': { +            'id': '2674839', +            'display_id': 'jani-gaat-naar-tokio-aflevering-4', +            'ext': 'mp4', +            'title': 'Jani gaat naar Tokio - Aflevering 4', +            'description': 'md5:aa8d611541db6ae9e863125704511f88', +            'upload_date': '20170501', +            'series': 'Jani gaat', +            'episode_number': 4, +            'tags': ['Jani Gaat', 'Volledige Aflevering'], +        }, +        'params': { +            'skip_download': True, +        }, +        'skip': 'Requires account credentials', +    }, { +        # Requires account credentials but bypassed extraction via v3/embed page +        # without metadata +        'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', +        'info_dict': { +            'id': '2674839', +            'display_id': 'jani-gaat-naar-tokio-aflevering-4', +            'ext': 'mp4', +            'title': 'jani-gaat-naar-tokio-aflevering-4',          },          'params': { -            # m3u8 download              'skip_download': True,          }, +        'expected_warnings': ['Log in to extract metadata'],      }, { -        'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', +        # Without video id in URL +        'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b',          'only_matching': True,      }, {          'url': 'http://www.vier.be/video/v3/embed/16129',          'only_matching': True, +    }, { +        'url': 'https://www.vijf.be/embed/video/public/4093', +        'only_matching': True, +    }, { +        'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics', +        'only_matching': True, +    }, { +        'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6', +        'only_matching': True,      }] +    def _real_initialize(self): +        self._logged_in = False + +    def _login(self, site): +        username, password = self._get_login_info() +        if username is None or password is None: +            return + +        login_page = self._download_webpage( +            'http://www.%s.be/user/login' % site, +            None, note='Logging in', errnote='Unable to log in', +            data=urlencode_postdata({ +                'form_id': 'user_login', +                'name': username, +                'pass': password, +            }), +            headers={'Content-Type': 'application/x-www-form-urlencoded'}) + +        login_error = self._html_search_regex( +            r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<', +            login_page, 'login error', default=None) +        if login_error: +            self.report_warning('Unable to log in: %s' % login_error) +        else: +            self._logged_in = True +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          embed_id = mobj.group('embed_id')          display_id = mobj.group('display_id') or embed_id +        video_id = mobj.group('id') or embed_id          site = mobj.group('site') +        if not self._logged_in: +            self._login(site) +          webpage = self._download_webpage(url, display_id) +        if r'id="user-login"' in webpage: +            self.report_warning( +                'Log in to extract metadata', video_id=display_id) +            webpage = self._download_webpage( +                'http://www.%s.be/video/v3/embed/%s' % (site, video_id), +                display_id) +          video_id = self._search_regex(              [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], -            webpage, 'video id') -        application = self._search_regex( -            [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], -            webpage, 'application', default=site + '_vod') -        filename = self._search_regex( -            [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], -            webpage, 'filename') - -        playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) -        formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash']) +            webpage, 'video id', default=video_id or display_id) + +        playlist_url = self._search_regex( +            r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1', +            webpage, 'm3u8 url', default=None, group='url') + +        if not playlist_url: +            application = self._search_regex( +                [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], +                webpage, 'application', default=site + '_vod') +            filename = self._search_regex( +                [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], +                webpage, 'filename') +            playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) + +        formats = self._extract_wowza_formats( +            playlist_url, display_id, skip_protocols=['dash'])          self._sort_formats(formats)          title = self._og_search_title(webpage, default=display_id) -        description = self._og_search_description(webpage, default=None) +        description = self._html_search_regex( +            r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>', +            webpage, 'description', default=None, group='value')          thumbnail = self._og_search_thumbnail(webpage, default=None) +        upload_date = unified_strdate(self._html_search_regex( +            r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})', +            webpage, 'upload date', default=None, group='value')) + +        series = self._search_regex( +            r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, +            'series', default=None, group='value') +        episode_number = int_or_none(self._search_regex( +            r'(?i)aflevering (\d+)', title, 'episode number', default=None)) +        tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage)          return {              'id': video_id, @@ -77,6 +196,10 @@ class VierIE(InfoExtractor):              'title': title,              'description': description,              'thumbnail': thumbnail, +            'upload_date': upload_date, +            'series': series, +            'episode_number': episode_number, +            'tags': tags,              'formats': formats,          }
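
The series/episode fields added above are scraped with small regexes rather than structured data; the episode number, for instance, is pulled out of a Dutch title. A standalone sketch of that step (sample title taken from the test case):

    import re

    def int_or_none(v):
        # Minimal stand-in for youtube_dl.utils.int_or_none.
        try:
            return int(v)
        except (TypeError, ValueError):
            return None

    title = 'Jani gaat naar Tokio - Aflevering 4'
    # Case-insensitive match, mirroring r'(?i)aflevering (\d+)' above.
    m = re.search(r'(?i)aflevering (\d+)', title)
    episode_number = int_or_none(m.group(1) if m else None)
    assert episode_number == 4
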
diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 4adcd1830..a0abbae60 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -4,12 +4,14 @@ import re  from .common import InfoExtractor  from ..compat import ( -    compat_urlparse, +    compat_HTTPError,      compat_str, +    compat_urlparse,  )  from ..utils import ( -    parse_duration, +    ExtractorError,      js_to_json, +    parse_duration,      parse_iso8601,  ) @@ -128,9 +130,16 @@ class ViideaIE(InfoExtractor):          base_url = self._proto_relative_url(cfg['livepipe'], 'http:') -        lecture_data = self._download_json( -            '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), -            lecture_id)['lecture'][0] +        try: +            lecture_data = self._download_json( +                '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), +                lecture_id)['lecture'][0] +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: +                msg = self._parse_json( +                    e.cause.read().decode('utf-8'), lecture_id) +                raise ExtractorError(msg['detail'], expected=True) +            raise          lecture_info = {              'id': lecture_id, diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 61cc469bf..c3f71b45e 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -151,10 +151,16 @@ class VimeoBaseInfoExtractor(InfoExtractor):                      else:                          mpd_manifest_urls = [(format_id, manifest_url)]                      for f_id, m_url in mpd_manifest_urls: -                        formats.extend(self._extract_mpd_formats( +                        mpd_formats = self._extract_mpd_formats(                              m_url.replace('/master.json', '/master.mpd'), video_id, f_id,                              'Downloading %s MPD information' % cdn_name, -                            fatal=False)) +                            fatal=False) +                        for f in mpd_formats: +                            if f.get('vcodec') == 'none': +                                f['preference'] = -50 +                            elif f.get('acodec') == 'none': +                                f['preference'] = -40 +                        formats.extend(mpd_formats)          subtitles = {}          text_tracks = config['request'].get('text_tracks') @@ -609,7 +615,10 @@ class VimeoIE(VimeoBaseInfoExtractor):                  if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'):                      source_name = source_file.get('public_name', 'Original')                      if self._is_valid_url(download_url, video_id, '%s video' % source_name): -                        ext = source_file.get('extension', determine_ext(download_url)).lower() +                        ext = (try_get( +                            source_file, lambda x: x['extension'], +                            compat_str) or determine_ext( +                            download_url, None) or 'mp4').lower()                          formats.append({                              'url': download_url,                              'ext': ext,
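
Demoting the audio-only (-50) and video-only (-40) MPD variants keeps muxed formats ahead when the list is sorted. A toy illustration of how such 'preference' values order a format list (a sketch; the real _sort_formats weighs many more fields, and the format dicts here are invented):

    # Hypothetical format dicts; only 'preference' matters for this sketch.
    formats = [
        {'format_id': 'dash-audio', 'vcodec': 'none', 'preference': -50},
        {'format_id': 'dash-video', 'acodec': 'none', 'preference': -40},
        {'format_id': 'http-720p', 'acodec': 'mp4a.40.2', 'vcodec': 'avc1'},
    ]

    # Sort ascending the way youtube-dl lists formats: best comes last.
    formats.sort(key=lambda f: f.get('preference') or 0)
    assert formats[-1]['format_id'] == 'http-720p'
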
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 4957a07f7..46950d3a1 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -92,10 +92,12 @@ class VineIE(InfoExtractor):          username = data.get('username') +        alt_title = 'Vine by %s' % username if username else None +          return {              'id': video_id, -            'title': data.get('description'), -            'alt_title': 'Vine by %s' % username if username else None, +            'title': data.get('description') or alt_title or 'Vine video', +            'alt_title': alt_title,              'thumbnail': data.get('thumbnailUrl'),              'timestamp': unified_timestamp(data.get('created')),              'uploader': username, diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index db6a65d2e..5cf93591c 100644 --- a/youtube_dl/extractor/viu.py +++ b/youtube_dl/extractor/viu.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( +    compat_kwargs, +    compat_str, +)  from ..utils import (      ExtractorError,      int_or_none, @@ -36,7 +39,8 @@ class ViuBaseIE(InfoExtractor):          headers.update(kwargs.get('headers', {}))          kwargs['headers'] = headers          response = self._download_json( -            'https://www.viu.com/api/' + path, *args, **kwargs)['response'] +            'https://www.viu.com/api/' + path, *args, +            **compat_kwargs(kwargs))['response']          if response.get('status') != 'success':              raise ExtractorError('%s said: %s' % (                  self.IE_NAME, response['message']), expected=True) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index dc2719cf9..105e172d5 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -25,6 +25,7 @@ from ..utils import (  from .dailymotion import DailymotionIE  from .pladform import PladformIE  from .vimeo import VimeoIE +from .youtube import YoutubeIE  class VKBaseIE(InfoExtractor): @@ -345,11 +346,9 @@ class VKIE(VKBaseIE):              if re.search(error_re, info_page):                  raise ExtractorError(error_msg % video_id, expected=True) -        youtube_url = self._search_regex( -            r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', -            info_page, 'youtube iframe', default=None) +        youtube_url = YoutubeIE._extract_url(info_page)          if youtube_url: -            return self.url_result(youtube_url, 'Youtube') +            return self.url_result(youtube_url, ie=YoutubeIE.ie_key())          vimeo_url = VimeoIE._extract_url(url, info_page)          if vimeo_url is not None: diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index e58940607..64d0224e6 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -49,6 +49,10 @@ class VLiveIE(InfoExtractor):          },      }] +    @classmethod +    def suitable(cls, url): +        return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) +      def _real_extract(self, url):          video_id = self._match_id(url) @@ -232,7 +236,12 @@ class VLiveChannelIE(InfoExtractor):                  query={                      'app_id': app_id,                      'channelSeq': channel_seq, -                    'maxNumOfRows': 1000, +                    # Large values of maxNumOfRows (~300 or above) may cause +                    # empty responses (see [1]), e.g. this happens for [2] that +                    # has more than 300 videos. +                    # 1. https://github.com/rg3/youtube-dl/issues/13830 +                    # 2. http://channels.vlive.tv/EDBF. +                    'maxNumOfRows': 100,                      '_': int(time.time()),                      'pageNo': page_num                  } @@ -261,3 +270,54 @@          return self.playlist_result(              entries, channel_code, channel_name) + + +class VLivePlaylistIE(InfoExtractor): +    IE_NAME = 'vlive:playlist' +    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://www.vlive.tv/video/22867/playlist/22912', +        'info_dict': { +            'id': '22912', +            'title': 'Valentine Day Message from TWICE' +        }, +        'playlist_mincount': 9 +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id, playlist_id = mobj.group('video_id', 'id') + +        VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' +        if self._downloader.params.get('noplaylist'): +            self.to_screen( +                'Downloading just video %s because of --no-playlist' % video_id) +            return self.url_result( +                VIDEO_URL_TEMPLATE % video_id, +                ie=VLiveIE.ie_key(), video_id=video_id) + +        self.to_screen( +            'Downloading playlist %s - add --no-playlist to just download video' +            % playlist_id) + +        webpage = self._download_webpage( +            'http://www.vlive.tv/video/%s/playlist/%s' +            % (video_id, playlist_id), playlist_id) + +        item_ids = self._parse_json( +            self._search_regex( +                r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage, +                'playlist video seqs'), +            playlist_id) + +        entries = [ +            self.url_result( +                VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(), +                video_id=compat_str(item_id)) +            for item_id in item_ids] + +        playlist_name = self._html_search_regex( +            r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)', +            webpage, 'playlist title', fatal=False) + +        return self.playlist_result(entries, playlist_id, playlist_name)
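
The new suitable() override is what keeps the two extractors from fighting over the same URL: a playlist URL also matches VLiveIE's video pattern, so VLiveIE must defer. A self-contained sketch of that mechanism (patterns trimmed to the essentials):

    import re

    VIDEO_RE = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
    PLAYLIST_RE = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'

    url = 'http://www.vlive.tv/video/22867/playlist/22912'
    # Both patterns match the playlist URL...
    assert re.match(VIDEO_RE, url) and re.match(PLAYLIST_RE, url)

    # ...so VLiveIE.suitable() answers False whenever the playlist pattern
    # matches, letting the playlist extractor win.
    def vlive_suitable(u):
        return not re.match(PLAYLIST_RE, u) and bool(re.match(VIDEO_RE, u))

    assert not vlive_suitable(url)
    assert vlive_suitable('http://www.vlive.tv/video/22867')
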
diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py new file mode 100644 index 000000000..5de3deb8c --- /dev/null +++ b/youtube_dl/extractor/voot.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import ( +    ExtractorError, +    int_or_none, +    try_get, +    unified_timestamp, +) + + +class VootIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?voot\.com/(?:[^/]+/)+(?P<id>\d+)' +    _GEO_COUNTRIES = ['IN'] +    _TESTS = [{ +        'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', +        'info_dict': { +            'id': '0_8ledb18o', +            'ext': 'mp4', +            'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', +            'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', +            'uploader_id': 'batchUser', +            'timestamp': 1472162937, +            'upload_date': '20160825', +            'duration': 1146, +            'series': 'Ishq Ka Rang Safed', +            'season_number': 1, +            'episode': 'Is this the end of Kamini?', +            'episode_number': 340, +            'view_count': int, +            'like_count': int, +        }, +        'params': { +            'skip_download': True, +        }, +        'expected_warnings': ['Failed to download m3u8 information'], +    }, { +        'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925', +        'only_matching': True, +    }, { +        'url': 'https://www.voot.com/movies/pandavas-5/424627', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        media_info = self._download_json( +            'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id, +            query={ +                'platform': 'Web', +                'pId': 2, +                'mediaId': video_id, +            }) + +        status_code = try_get(media_info, lambda x: x['status']['code'], int) +        if status_code != 0: +            raise ExtractorError(media_info['status']['message'], expected=True) + +        media = media_info['assets'] + +        entry_id = media['EntryId'] +        title = media['MediaName'] + +        description, series, season_number, episode, episode_number = [None] * 5 + +        for meta in try_get(media, lambda x: x['Metas'], list) or []: +            key, value = meta.get('Key'), meta.get('Value') +            if not key or not value: +                continue +            if key == 'ContentSynopsis': +                description = value +            elif key == 'RefSeriesTitle': +                series = value +            elif key == 'RefSeriesSeason': +                season_number = int_or_none(value) +            elif key == 'EpisodeMainTitle': +                episode = value +            elif key == 'EpisodeNo': +                episode_number = int_or_none(value) + +        return { +            '_type': 'url_transparent', +            'url': 'kaltura:1982551:%s' % entry_id, +            'ie_key': KalturaIE.ie_key(), +            'title': title, +            'description': description, +            'series': series, +            'season_number': season_number, +            'episode': episode, +            'episode_number': episode_number, +            'timestamp': unified_timestamp(media.get('CreationDate')), +            'duration': int_or_none(media.get('Duration')), +            'view_count': int_or_none(media.get('ViewCounter')), +            'like_count': int_or_none(media.get('like_counter')), +        }
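
VootIE never downloads media itself: it resolves the site's id to a Kaltura entry and hands off via a url_transparent result, layering its own metadata on top of whatever KalturaIE extracts. A minimal sketch of the Metas loop feeding that result (the sample payload is invented):

    def int_or_none(v):
        # Minimal stand-in for youtube_dl.utils.int_or_none.
        try:
            return int(v)
        except (TypeError, ValueError):
            return None

    media = {'Metas': [
        {'Key': 'RefSeriesTitle', 'Value': 'Ishq Ka Rang Safed'},
        {'Key': 'RefSeriesSeason', 'Value': '1'},
        {'Key': 'EpisodeNo', 'Value': '340'},
    ]}

    info = {}
    for meta in media.get('Metas') or []:
        key, value = meta.get('Key'), meta.get('Value')
        if not key or not value:
            continue
        if key == 'RefSeriesTitle':
            info['series'] = value
        elif key == 'RefSeriesSeason':
            info['season_number'] = int_or_none(value)
        elif key == 'EpisodeNo':
            info['episode_number'] = int_or_none(value)

    assert info == {'series': 'Ishq Ka Rang Safed',
                    'season_number': 1, 'episode_number': 340}
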
diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 487047fd7..9959627c0 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -112,21 +112,41 @@ class VRVIE(VRVBaseIE):          audio_locale = streams_json.get('audio_locale')          formats = [] -        for stream_id, stream in streams_json.get('streams', {}).get('adaptive_hls', {}).items(): -            stream_url = stream.get('url') -            if not stream_url: -                continue -            stream_id = stream_id or audio_locale -            m3u8_formats = self._extract_m3u8_formats( -                stream_url, video_id, 'mp4', m3u8_id=stream_id, -                note='Downloading %s m3u8 information' % stream_id, -                fatal=False) -            if audio_locale: -                for f in m3u8_formats: -                    f['language'] = audio_locale -            formats.extend(m3u8_formats) +        for stream_type, streams in streams_json.get('streams', {}).items(): +            if stream_type in ('adaptive_hls', 'adaptive_dash'): +                for stream in streams.values(): +                    stream_url = stream.get('url') +                    if not stream_url: +                        continue +                    stream_id = stream.get('hardsub_locale') or audio_locale +                    format_id = '%s-%s' % (stream_type.split('_')[1], stream_id) +                    if stream_type == 'adaptive_hls': +                        adaptive_formats = self._extract_m3u8_formats( +                            stream_url, video_id, 'mp4', m3u8_id=format_id, +                            note='Downloading %s m3u8 information' % stream_id, +                            fatal=False) +                    else: +                        adaptive_formats = self._extract_mpd_formats( +                            stream_url, video_id, mpd_id=format_id, +                            note='Downloading %s MPD information' % stream_id, +                            fatal=False) +                    if audio_locale: +                        for f in adaptive_formats: +                            if f.get('acodec') != 'none': +                                f['language'] = audio_locale +                    formats.extend(adaptive_formats)          self._sort_formats(formats) +        subtitles = {} +        for subtitle in streams_json.get('subtitles', {}).values(): +            subtitle_url = subtitle.get('url') +            if not subtitle_url: +                continue +            subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({ +                'url': subtitle_url, +                'ext': subtitle.get('format', 'ass'), +            }) +          thumbnails = []          for thumbnail in video_data.get('images', {}).get('thumbnails', []):              thumbnail_url = thumbnail.get('source') @@ -142,6 +162,7 @@ class VRVIE(VRVBaseIE):              'id': video_id,              'title': title,              'formats': formats, +            'subtitles': subtitles,              'thumbnails': thumbnails,              'description': video_data.get('description'),              'duration': float_or_none(video_data.get('duration_ms'), 1000),
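
The new subtitles loop groups tracks by locale with dict.setdefault, so several subtitle files for one language end up in a single list keyed by that locale. A quick sketch with made-up track data:

    subtitles = {}
    for subtitle in {
        's1': {'url': 'https://example.com/en.ass', 'locale': 'en-US', 'format': 'ass'},
        's2': {'url': 'https://example.com/de.ass', 'locale': 'de-DE', 'format': 'ass'},
    }.values():
        subtitle_url = subtitle.get('url')
        if not subtitle_url:
            continue
        # setdefault creates the per-locale list on first sight, then appends.
        subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({
            'url': subtitle_url,
            'ext': subtitle.get('format', 'ass'),
        })

    assert sorted(subtitles) == ['de-DE', 'en-US']
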
diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py index b270f08d1..02fcd52c7 100644 --- a/youtube_dl/extractor/vzaar.py +++ b/youtube_dl/extractor/vzaar.py @@ -1,6 +1,8 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..utils import (      int_or_none, @@ -28,6 +30,12 @@ class VzaarIE(InfoExtractor):          },      }] +    @staticmethod +    def _extract_urls(webpage): +        return re.findall( +            r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)', +            webpage) +      def _real_extract(self, url):          video_id = self._match_id(url)          video_data = self._download_json( diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py new file mode 100644 index 000000000..b382338fa --- /dev/null +++ b/youtube_dl/extractor/watchbox.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    int_or_none, +    js_to_json, +    strip_or_none, +    try_get, +    unified_timestamp, +) + + +class WatchBoxIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?P<kind>serien|filme)/(?:[^/]+/)*[^/]+-(?P<id>\d+)' +    _TESTS = [{ +        # film +        'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html', +        'info_dict': { +            'id': '341368', +            'ext': 'mp4', +            'title': 'Free Jimmy', +            'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6', +            'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 4890, +            'age_limit': 16, +            'release_year': 2009, +        }, +        'params': { +            'format': 'bestvideo', +            'skip_download': True, +        }, +        'expected_warnings': ['Failed to download m3u8 information'], +    }, { +        # episode +        'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html', +        'info_dict': { +            'id': '328286', +            'ext': 'mp4', +            'title': 'S01 E01 - Date in der Hölle', +            'description': 'md5:2f31c74a8186899f33cb5114491dae2b', +            'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 1291, +            'age_limit': 12, +            'release_year': 2010, +            'series': 'Ugly Americans', +            'season_number': 1, +            'episode': 'Date in der Hölle', +            'episode_number': 1, +        }, +        'params': { +            'format': 'bestvideo', +            'skip_download': True, +        }, +        'expected_warnings': ['Failed to download m3u8 information'], +    }, { +        'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        kind, video_id = mobj.group('kind', 'id') + +        webpage = self._download_webpage(url, video_id) + +        source = self._parse_json( +            self._search_regex( +                r'(?s)source\s*:\s*({.+?})\s*,\s*\n', webpage, 'source', +                default='{}'), +            video_id, transform_source=js_to_json, fatal=False) or {} + +        video_id = compat_str(source.get('videoId') or video_id) + +        devapi = self._download_json( +            'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={ +                'format': 'json', +                'apikey': 'hbbtv', +            }, fatal=False) + +        item = try_get(devapi, lambda x: x['items'][0], dict) or {} + +        title = item.get('title') or try_get( +            item, lambda x: x['movie']['headline_movie'], +            compat_str) or source['title'] + +        formats = [] +        hls_url = item.get('media_videourl_hls') or source.get('hls') +        if hls_url: +            formats.extend(self._extract_m3u8_formats( +                hls_url, video_id, 'mp4', entry_protocol='m3u8_native', +                m3u8_id='hls', fatal=False)) +        dash_url = item.get('media_videourl_wv') or source.get('dash') +        if dash_url: +            formats.extend(self._extract_mpd_formats( +                dash_url, video_id, mpd_id='dash', fatal=False)) +        mp4_url = item.get('media_videourl') +        if mp4_url: +            formats.append({ +                'url': mp4_url, +                'format_id': 'mp4', +                'width': int_or_none(item.get('width')), +                'height': int_or_none(item.get('height')), +                'tbr': int_or_none(item.get('bitrate')), +            }) +        self._sort_formats(formats) + +        description = strip_or_none(item.get('descr')) +        thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail') +        duration = int_or_none(item.get('media_length') or source.get('length')) +        timestamp = unified_timestamp(item.get('pubDate')) +        view_count = int_or_none(item.get('media_views')) +        age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk'])) +        release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year'])) + +        info = { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'timestamp': timestamp, +            'view_count': view_count, +            'age_limit': age_limit, +            'release_year': release_year, +            'formats': formats, +        } + +        if kind.lower() == 'serien': +            series = try_get( +                item, lambda x: x['special']['title'], +                compat_str) or source.get('format') +            season_number = int_or_none(self._search_regex( +                r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number', +                default=None) or self._search_regex( +                    r'/staffel-(\d+)/', url, 'season number', default=None)) +            episode = source.get('title') +            episode_number = int_or_none(self._search_regex( +                r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number', +                default=None)) +            info.update({ +                'series': series, +                'season_number': season_number, +                'episode': episode, +                'episode_number': episode_number, +            }) + +        return info
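
For episodes, WatchBox falls back from the 'S01 E01 - …' title prefix to the /staffel-N/ path segment when deriving season and episode numbers. The regexes above, exercised standalone (sample strings taken from the test case):

    import re

    title = 'S01 E01 - Date in der Hölle'
    url = 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html'

    season = re.search(r'^S(\d{1,2})\s*E\d{1,2}', title)
    season_number = int(season.group(1)) if season else None
    if season_number is None:
        # Fallback: take the season from the URL path instead.
        m = re.search(r'/staffel-(\d+)/', url)
        season_number = int(m.group(1)) if m else None

    episode = re.search(r'^S\d{1,2}\s*E(\d{1,2})', title)
    episode_number = int(episode.group(1)) if episode else None

    assert (season_number, episode_number) == (1, 1)
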
diff --git a/youtube_dl/extractor/watchindianporn.py b/youtube_dl/extractor/watchindianporn.py index ed099beea..fadc539ee 100644 --- a/youtube_dl/extractor/watchindianporn.py +++ b/youtube_dl/extractor/watchindianporn.py @@ -4,11 +4,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import ( -    unified_strdate, -    parse_duration, -    int_or_none, -) +from ..utils import parse_duration  class WatchIndianPornIE(InfoExtractor): @@ -23,11 +19,8 @@              'ext': 'mp4',              'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera',              'thumbnail': r're:^https?://.*\.jpg$', -            'uploader': 'LoveJay', -            'upload_date': '20160428',              'duration': 226,              'view_count': int, -            'comment_count': int,              'categories': list,              'age_limit': 18,          } @@ -40,51 +33,36 @@          webpage = self._download_webpage(url, display_id) -        video_url = self._html_search_regex( -            r"url: escape\('([^']+)'\)", webpage, 'url') +        info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] -        title = self._html_search_regex( -            r'<h2 class="he2"><span>(.*?)</span>', -            webpage, 'title') -        thumbnail = self._html_search_regex( -            r'<span id="container"><img\s+src="([^"]+)"', -            webpage, 'thumbnail', fatal=False) - -        uploader = self._html_search_regex( -            r'class="aupa">\s*(.*?)</a>', -            webpage, 'uploader') -        upload_date = unified_strdate(self._html_search_regex( -            r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) +        title = self._html_search_regex(( +            r'<title>(.+?)\s*-\s*Indian\s+Porn</title>', +            r'<h4>(.+?)</h4>' +        ), webpage, 'title')          duration = parse_duration(self._search_regex( -            r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', +            r'Time:\s*<strong>\s*(.+?)\s*</strong>',              webpage, 'duration', fatal=False)) -        view_count = int_or_none(self._search_regex( -            r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', +        view_count = int(self._search_regex( +            r'(?s)Time:\s*<strong>.*?</strong>.*?<strong>\s*(\d+)\s*</strong>',              webpage, 'view count', fatal=False)) -        comment_count = int_or_none(self._search_regex( -            r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', -            webpage, 'comment count', fatal=False))          categories = re.findall( -            r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', +            r'<a[^>]+class=[\'"]categories[\'"][^>]*>\s*([^<]+)\s*</a>',              webpage) -        return { +        info_dict.update({              'id': video_id,              'display_id': display_id, -            'url': video_url,              'http_headers': {                  'Referer': url,              },              'title': title, -            'thumbnail': thumbnail, -            'uploader': uploader, -            'upload_date': upload_date,              'duration': duration,              'view_count': view_count, -            'comment_count': comment_count,              'categories': categories,              'age_limit': 18, -        } +        }) + +        return info_dict diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index c634b8dec..2182d6fd4 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -1,10 +1,13 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..utils import (      ExtractorError,      int_or_none,      float_or_none, +    unescapeHTML,  ) @@ -34,6 +37,25 @@ class WistiaIE(InfoExtractor):          'only_matching': True,      }] +    @staticmethod +    def _extract_url(webpage): +        match = re.search( +            r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) +        if match: +            return unescapeHTML(match.group('url')) + +        match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) +        if match: +            return 'wistia:%s' % match.group('id') + +        match = re.search( +            r'''(?sx) +                <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? +                <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 +            ''', webpage) +        if match: +            return 'wistia:%s' % match.group('id') +      def _real_extract(self, url):          video_id = self._match_id(url)
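
WistiaIE._extract_url gives the generic extractor three ways to spot an embed: iframe/meta URLs, legacy id attributes or Wistia.embed() calls, and the async E-v1.js pattern. A quick standalone check of the second pattern against an invented page snippet:

    import re

    webpage = '<div id="wistia_g5pnf59ala" class="wistia_embed"></div>'
    m = re.search(
        r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)',
        webpage)
    # The 'wistia_' prefix is consumed by the pattern, so only the bare id is
    # captured and turned into a wistia: URL for WistiaIE itself.
    assert m and 'wistia:%s' % m.group('id') == 'wistia:g5pnf59ala'
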
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index 45cfca7c5..9b5487710 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -13,7 +13,7 @@ class WSJIE(InfoExtractor):      _VALID_URL = r'''(?x)                          (?:                              https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=| -                            https?://(?:www\.)?wsj\.com/video/[^/]+/| +                            https?://(?:www\.)?(?:wsj|barrons)\.com/video/[^/]+/|                              wsj:                          )                          (?P<id>[a-fA-F0-9-]{36}) @@ -35,6 +35,9 @@      }, {          'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html',          'only_matching': True, +    }, { +        'url': 'http://www.barrons.com/video/capitalism-deserves-more-respect-from-millennials/F301217E-6F46-43AE-B8D2-B7180D642EE9.html', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 13f8be6cb..ad747978d 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -10,7 +10,6 @@ from ..utils import (      ExtractorError,      int_or_none,      NO_DEFAULT, -    sanitized_Request,      urlencode_postdata,  ) @@ -30,6 +29,8 @@ class XFileShareIE(InfoExtractor):          (r'vidabc\.com', 'Vid ABC'),          (r'vidbom\.com', 'VidBom'),          (r'vidlo\.us', 'vidlo'), +        (r'rapidvideo\.(?:cool|org)', 'RapidVideo.TV'), +        (r'fastvideo\.me', 'FastVideo.me'),      )      IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) @@ -109,6 +110,12 @@          'params': {              'skip_download': True,          }, +    }, { +        'url': 'http://www.rapidvideo.cool/b667kprndr8w', +        'only_matching': True, +    }, { +        'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html', +        'only_matching': True      }]      def _real_extract(self, url): @@ -130,12 +137,12 @@              if countdown:                  self._sleep(countdown, video_id) -            post = urlencode_postdata(fields) - -            req = sanitized_Request(url, post) -            req.add_header('Content-type', 'application/x-www-form-urlencoded') - -            webpage = self._download_webpage(req, video_id, 'Downloading video page') +            webpage = self._download_webpage( +                url, video_id, 'Downloading video page', +                data=urlencode_postdata(fields), headers={ +                    'Referer': url, +                    'Content-type': 'application/x-www-form-urlencoded', +                })          title = (self._search_regex(              (r'style="z-index: [0-9]+;">([^<]+)</span>', @@ -150,7 +157,7 @@          def extract_formats(default=NO_DEFAULT):              urls = []              for regex in ( -                    r'file\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', +                    r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',                      r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',                      r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',                      r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 7b6703714..c42b59e51 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import ( +    clean_html,      dict_get,      ExtractorError,      int_or_none, @@ -13,29 +15,41 @@ from ..utils import (  class XHamsterIE(InfoExtractor): -    _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?' +    _VALID_URL = r'''(?x) +                    https?:// +                        (?:.+?\.)?xhamster\.com/ +                        (?: +                            movies/(?P<id>\d+)/(?P<display_id>[^/]*)\.html| +                            videos/(?P<display_id_2>[^/]*)-(?P<id_2>\d+) +                        ) +                    ''' +      _TESTS = [{          'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',          'md5': '8281348b8d3c53d39fffb377d24eac4e',          'info_dict': {              'id': '1509445', +            'display_id': 'femaleagent_shy_beauty_takes_the_bait',              'ext': 'mp4',              'title': 'FemaleAgent Shy beauty takes the bait',              'upload_date': '20121014',              'uploader': 'Ruseful2011',              'duration': 893,              'age_limit': 18, +            'categories': ['Fake Hub', 'Amateur', 'MILFs', 'POV', 'Boss', 'Office', 'Oral', 'Reality', 'Sexy'],          },      }, {          'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',          'info_dict': {              'id': '2221348', +            'display_id': 'britney_spears_sexy_booty',              'ext': 'mp4',              'title': 'Britney Spears  Sexy Booty',              'upload_date': '20130914',              'uploader': 'jojo747400',              'duration': 200,              'age_limit': 18, +            'categories': ['Britney Spears', 'Celebrities', 'HD Videos', 'Sexy', 'Sexy Booty'],          },          'params': {              'skip_download': True, @@ -51,6 +65,7 @@ class XHamsterIE(InfoExtractor):              'uploader': 'parejafree',              'duration': 72,              'age_limit': 18, +            'categories': ['Amateur', 'Blowjobs'],          },          'params': {              'skip_download': True, @@ -62,26 +77,18 @@          # This video is visible for marcoalfa123456's friends only          'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html',          'only_matching': True, +    }, { +        # new URL schema +        'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821', +        'only_matching': True,      }]      def _real_extract(self, url): -        def extract_video_url(webpage, name): -            return self._search_regex( -                [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', -                 r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', -                 r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], -                webpage, name, group='mp4') - -        def is_hd(webpage): -            return '<div class=\'icon iconHD\'' in webpage -          mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') or mobj.group('id_2') +        display_id = mobj.group('display_id') or mobj.group('display_id_2') -        video_id = mobj.group('id') -        seo = mobj.group('seo') -        proto = mobj.group('proto') -        mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo) -        webpage = self._download_webpage(mrss_url, video_id) +        webpage = self._download_webpage(url, video_id)          error = self._html_search_regex(              r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', @@ -95,6 +102,39 @@               r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'],              webpage, 'title') +        formats = [] +        format_urls = set() + +        sources = self._parse_json( +            self._search_regex( +                r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources', +                default='{}'), +            video_id, fatal=False) +        for format_id, format_url in sources.items(): +            if not isinstance(format_url, compat_str): +                continue +            if format_url in format_urls: +                continue +            format_urls.add(format_url) +            formats.append({ +                'format_id': format_id, +                'url': format_url, +                'height': int_or_none(self._search_regex( +                    r'^(\d+)[pP]', format_id, 'height', default=None)) +            }) + +        video_url = self._search_regex( +            [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', +             r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', +             r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], +            webpage, 'video url', group='mp4', default=None) +        if video_url and video_url not in format_urls: +            formats.append({ +                'url': video_url, +            }) + +        self._sort_formats(formats) +          # Only a few videos have an description          mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)          description = mobj.group(1) if mobj else None @@ -104,7 +144,7 @@              webpage, 'upload date', fatal=False))          uploader = self._html_search_regex( -            r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+href=["\'].+?xhamster\.com/user/[^>]+>(?P<uploader>.+?)</a>', +            r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+><span[^>]+>([^<]+)',              webpage, 'uploader', default='anonymous')          thumbnail = self._search_regex( @@ -113,14 +153,15 @@              webpage, 'thumbnail', fatal=False, group='thumbnail')          duration = parse_duration(self._search_regex( -            r'Runtime:\s*</span>\s*([\d:]+)', webpage, +            [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']', +             r'Runtime:\s*</span>\s*([\d:]+)'], webpage,              'duration', fatal=False))          view_count = int_or_none(self._search_regex(              r'content=["\']User(?:View|Play)s:(\d+)',              webpage, 'view count', fatal=False)) -        mobj = re.search(r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage) +        mobj = re.search(r'hint=[\'"](?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes', webpage)          (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)          mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage) @@ -128,32 +169,15 @@          age_limit = self._rta_search(webpage) -        hd = is_hd(webpage) - -        format_id = 'hd' if hd else 'sd' - -        video_url = extract_video_url(webpage, format_id) -        formats = [{ -            'url': video_url, -            'format_id': 'hd' if hd else 'sd', -            'preference': 1, -        }] - -        if not hd: -            mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url') -            webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage') -            if is_hd(webpage): -                video_url = extract_video_url(webpage, 'hd') -                formats.append({ -                    'url': video_url, -                    'format_id': 'hd', -                    'preference': 2, -                }) - -        self._sort_formats(formats) +        categories_html = self._search_regex( +            r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage, +            'categories', default=None) +        categories = [clean_html(category) for category in re.findall( +            r'<a[^>]+>(.+?)</a>', categories_html)] if categories_html else None          return {              'id': video_id, +            'display_id': display_id,              'title': title,              'description': description,              'upload_date': upload_date, @@ -165,6 +189,7 @@              'dislike_count': int_or_none(dislike_count),              'comment_count': int_or_none(comment_count),              'age_limit': age_limit, +            'categories': categories,              'formats': formats,          }
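
The sources object on the new xHamster pages maps format ids like '720p' straight to URLs, so the height can be recovered from the key itself. A standalone sketch with a fabricated sources dict:

    import re

    sources = {
        '240p': 'https://example.com/240.mp4',
        '720p': 'https://example.com/720.mp4',
    }

    formats = []
    for format_id, format_url in sources.items():
        # Mirrors the r'^(\d+)[pP]' lookup above.
        m = re.search(r'^(\d+)[pP]', format_id)
        formats.append({
            'format_id': format_id,
            'url': format_url,
            'height': int(m.group(1)) if m else None,
        })

    assert sorted(f['height'] for f in formats) == [240, 720]
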
diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index e0818201a..0276c0dbb 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -1,14 +1,13 @@  # coding: utf-8  from __future__ import unicode_literals -import base64 -  from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote  from ..utils import (      ExtractorError, +    float_or_none, +    get_element_by_attribute,      parse_iso8601, -    parse_duration, +    remove_end,  ) @@ -24,6 +23,7 @@ class XuiteIE(InfoExtractor):              'id': '3860914',              'ext': 'mp3',              'title': '孤單南半球-歐德陽', +            'description': '孤單南半球-歐德陽',              'thumbnail': r're:^https?://.*\.jpg$',              'duration': 247.246,              'timestamp': 1314932940, @@ -44,7 +44,7 @@              'duration': 596.458,              'timestamp': 1454242500,              'upload_date': '20160131', -            'uploader': 'yan12125', +            'uploader': '屁姥',              'uploader_id': '12158353',              'categories': ['個人短片'],              'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4', @@ -72,10 +72,10 @@          # from http://forgetfulbc.blogspot.com/2016/06/date.html          'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0',          'info_dict': { -            'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==', +            'id': '27447336',              'ext': 'mp4',              'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)', -            'description': 'md5:f0abdcb69df300f522a5442ef3146f2a', +            'description': 'md5:1223810fa123b179083a3aed53574706',              'timestamp': 1466160960,              'upload_date': '20160617',              'uploader': 'B.C. & Lowy', @@ -86,29 +86,9 @@          'only_matching': True,      }] -    @staticmethod -    def base64_decode_utf8(data): -        return base64.b64decode(data.encode('utf-8')).decode('utf-8') - -    @staticmethod -    def base64_encode_utf8(data): -        return base64.b64encode(data.encode('utf-8')).decode('utf-8') - -    def _extract_flv_config(self, encoded_media_id): -        flv_config = self._download_xml( -            'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id, -            'flv config') -        prop_dict = {} -        for prop in flv_config.findall('./property'): -            prop_id = self.base64_decode_utf8(prop.attrib['id']) -            # CDATA may be empty in flv config -            if not prop.text: -                continue -            encoded_content = self.base64_decode_utf8(prop.text) -            prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content) -        return prop_dict -      def _real_extract(self, url):
+        # /play/ URLs provide embedded video URL and more metadata
+        url = url.replace('/embed/', '/play/')          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) @@ -121,51 +101,53 @@                  '%s returned error: %s' % (self.IE_NAME, error_msg),                  expected=True) -        encoded_media_id = self._search_regex( -            r'attributes\.name\s*=\s*"([^"]+)"', webpage, -            'encoded media id', default=None) -        if encoded_media_id is None: -            video_id = self._html_search_regex( -                r'data-mediaid="(\d+)"', webpage, 'media id') -            encoded_media_id = self.base64_encode_utf8(video_id) -        flv_config = self._extract_flv_config(encoded_media_id) +        media_info = self._parse_json(self._search_regex( +            r'var\s+mediaInfo\s*=\s*({.*});', webpage, 'media info'), video_id) -        FORMATS = { -            'audio': 'mp3', -            'video': 'mp4', -        } +        video_id = media_info['MEDIA_ID']          formats = [] -        for format_tag in ('src', 'hq_src'): -            video_url = flv_config.get(format_tag) +        for key in ('html5Url', 'html5HQUrl'): +            video_url = media_info.get(key)              if not video_url:                  continue              format_id = self._search_regex( -                r'\bq=(.+?)\b', video_url, 'format id', default=format_tag) +                r'\bq=(.+?)\b', video_url, 'format id', default=None)              formats.append({                  'url': video_url, -                'ext': FORMATS.get(flv_config['type'], 'mp4'), +                'ext': 'mp4' if format_id.isnumeric() else format_id,                  'format_id': format_id,                  'height': int(format_id) if format_id.isnumeric() else None,              })          self._sort_formats(formats) -        timestamp = flv_config.get('publish_datetime')
+        timestamp = media_info.get('PUBLISH_DATETIME')          if timestamp:              timestamp = parse_iso8601(timestamp + ' +0800', ' ') -        category = flv_config.get('category') +        category = media_info.get('catName')          categories = [category] if category else [] +        uploader = media_info.get('NICKNAME') +        uploader_url = None + +        author_div = get_element_by_attribute('itemprop', 'author', webpage) +        if author_div: +            uploader = uploader or self._html_search_meta('name', author_div) +            uploader_url = self._html_search_regex( +                r'<link[^>]+itemprop="url"[^>]+href="([^"]+)"', author_div, +                'uploader URL', fatal=False) +          return {              'id': video_id, -            'title': flv_config['title'], -            'description': flv_config.get('description'), -            'thumbnail': flv_config.get('thumb'), +            'title': media_info['TITLE'], +            'description': remove_end(media_info.get('metaDesc'), ' (Xuite 影音)'), +            'thumbnail': media_info.get('ogImageUrl'),              'timestamp': timestamp, -            'uploader': flv_config.get('author_name'), -            'uploader_id': flv_config.get('author_id'), -            'duration': parse_duration(flv_config.get('duration')), +            'uploader': uploader, +            'uploader_id': media_info.get('MEMBER_ID'), +            'uploader_url': uploader_url, +            'duration': float_or_none(media_info.get('MEDIA_DURATION'), 1000000),              'categories': categories,              'formats': formats,          }
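
The mediaInfo blob stores MEDIA_DURATION in microseconds, hence the 1000000 scale passed to float_or_none above. A tiny sketch of that conversion (the input value is chosen to match the 247.246 duration in the test case):

    def float_or_none(v, scale=1):
        # Minimal stand-in for youtube_dl.utils.float_or_none.
        try:
            return float(v) / scale
        except (TypeError, ValueError):
            return None

    media_info = {'MEDIA_DURATION': 247246000}
    assert float_or_none(media_info.get('MEDIA_DURATION'), 1000000) == 247.246
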
diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py index 5c8f17eb2..e34ebe3a6 100644 --- a/youtube_dl/extractor/xxxymovies.py +++ b/youtube_dl/extractor/xxxymovies.py @@ -39,8 +39,8 @@ class XXXYMoviesIE(InfoExtractor):              r"video_url\s*:\s*'([^']+)'", webpage, 'video URL')          title = self._html_search_regex( -            [r'<div class="block_header">\s*<h1>([^<]+)</h1>', -             r'<title>(.*?)\s*-\s*XXXYMovies\.com</title>'], +            [r'<div[^>]+\bclass="block_header"[^>]*>\s*<h1>([^<]+)<', +             r'<title>(.*?)\s*-\s*(?:XXXYMovies\.com|XXX\s+Movies)</title>'],              webpage, 'title')          thumbnail = self._search_regex( diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py deleted file mode 100644 index ef5535547..000000000 --- a/youtube_dl/extractor/yam.py +++ /dev/null @@ -1,123 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( -    float_or_none, -    month_by_abbreviation, -    ExtractorError, -    get_element_by_attribute, -) - - -class YamIE(InfoExtractor): -    IE_DESC = '蕃薯藤yam天空部落' -    _VALID_URL = r'https?://mymedia\.yam\.com/m/(?P<id>\d+)' - -    _TESTS = [{ -        # An audio hosted on Yam -        'url': 'http://mymedia.yam.com/m/2283921', -        'md5': 'c011b8e262a52d5473d9c2e3c9963b9c', -        'info_dict': { -            'id': '2283921', -            'ext': 'mp3', -            'title': '發現 - 趙薇 京華煙雲主題曲', -            'description': '發現 - 趙薇 京華煙雲主題曲', -            'uploader_id': 'princekt', -            'upload_date': '20080807', -            'duration': 313.0, -        } -    }, { -        # An external video hosted on YouTube -        'url': 'http://mymedia.yam.com/m/3599430', -        'md5': '03127cf10d8f35d120a9e8e52e3b17c6', -        'info_dict': { -            'id': 'CNpEoQlrIgA', -            'ext': 'mp4', -            'upload_date': '20150306', -            'uploader': '新莊社大瑜伽社', -            'description': 'md5:11e2e405311633ace874f2e6226c8b17', -            'uploader_id': '2323agoy', -            'title': '20090412陽明山二子坪-1', -        }, -        'skip': 'Video does not exist', -    }, { -        'url': 'http://mymedia.yam.com/m/3598173', -        'info_dict': { -            'id': '3598173', -            'ext': 'mp4', -        }, -        'skip': 'cause Yam system error', -    }, { -        'url': 'http://mymedia.yam.com/m/3599437', -        'info_dict': { -            'id': '3599437', -            'ext': 'mp4', -        }, -        'skip': 'invalid YouTube URL', -    }, { -        'url': 'http://mymedia.yam.com/m/2373534', -        'md5': '7ff74b91b7a817269d83796f8c5890b1', -        'info_dict': { -            'id': '2373534', -            'ext': 'mp3', -            'title': '林俊傑&蔡卓妍-小酒窩', -            'description': 'md5:904003395a0fcce6cfb25028ff468420', -            'upload_date': '20080928', -            'uploader_id': 'onliner2', -        } -    }] - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        page = self._download_webpage(url, video_id) - -        # Check for errors -        system_msg = self._html_search_regex( -            r'系統訊息(?:<br>|\n|\r)*([^<>]+)<br>', page, 'system message', -            default=None) -        if system_msg: -            raise ExtractorError(system_msg, expected=True) - -        # Is it hosted externally on YouTube? -        youtube_url = self._html_search_regex( -            r'<embed src="(http://www.youtube.com/[^"]+)"', -            page, 'YouTube url', default=None) -        if youtube_url: -            return self.url_result(youtube_url, 'Youtube') - -        title = self._html_search_regex( -            r'<h1[^>]+class="heading"[^>]*>\s*(.+)\s*</h1>', page, 'title') - -        api_page = self._download_webpage( -            'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id, -            note='Downloading API page') -        api_result_obj = compat_urlparse.parse_qs(api_page) - -        info_table = get_element_by_attribute('class', 'info', page) -        uploader_id = self._html_search_regex( -            r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z0-9]+)"', -            info_table, 'uploader id', fatal=False) -        mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})\s+' + -                         r'(?P<day>\d{1,2}), (?P<year>\d{4})', page) -        if mobj: -            upload_date = '%s%02d%02d' % ( -                mobj.group('year'), -                month_by_abbreviation(mobj.group('mon')), -                int(mobj.group('day'))) -        else: -            upload_date = None -        duration = float_or_none(api_result_obj['totaltime'][0], scale=1000) - -        return { -            'id': video_id, -            'url': api_result_obj['mp3file'][0], -            'title': title, -            'description': self._html_search_meta('description', page), -            'duration': duration, -            'uploader_id': uploader_id, -            'upload_date': upload_date, -        } diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py new file mode 100644 index 000000000..e8f6ae10f --- /dev/null +++ b/youtube_dl/extractor/yandexdisk.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    determine_ext, +    float_or_none, +    int_or_none, +    try_get, +    urlencode_postdata, +) + + +class YandexDiskIE(InfoExtractor): +    _VALID_URL = r'https?://yadi\.sk/[di]/(?P<id>[^/?#&]+)' + +    _TESTS = [{ +        'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', +        'md5': '33955d7ae052f15853dc41f35f17581c', +        'info_dict': { +            'id': 'VdOeDou8eZs6Y', +            'ext': 'mp4', +            'title': '4.mp4', +            'duration': 168.6, +            'uploader': 'y.botova', +            'uploader_id': '300043621', +            'view_count': int, +        }, +    }, { +        'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        status = self._download_webpage( +            'https://disk.yandex.com/auth/status', video_id, query={ +                'urlOrigin': url, +                'source': 'public', +                'md5': 'false', +            }) + +        sk = self._search_regex( +            r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P<value>(?:(?!\2).)+)\2', +            status, 'sk', group='value') + +        webpage = self._download_webpage(url, video_id) + +        models = self._parse_json( +            self._search_regex( +                r'<script[^>]+id=["\']models-client[^>]+>\s*(\[.+?\])\s*</script', +                webpage, 'video JSON'), +            video_id) + +        data = next( +            model['data'] for model in models +            if model.get('model') == 'resource') + +        video_hash
= data['id'] +        title = data['name'] + +        models = self._download_json( +            'https://disk.yandex.com/models/', video_id, +            data=urlencode_postdata({ +                '_model.0': 'videoInfo', +                'id.0': video_hash, +                '_model.1': 'do-get-resource-url', +                'id.1': video_hash, +                'version': '13.6', +                'sk': sk, +            }), query={'_m': 'videoInfo'})['models'] + +        videos = try_get(models, lambda x: x[0]['data']['videos'], list) or [] +        source_url = try_get( +            models, lambda x: x[1]['data']['file'], compat_str) + +        formats = [] +        if source_url: +            formats.append({ +                'url': source_url, +                'format_id': 'source', +                'ext': determine_ext(title, 'mp4'), +                'quality': 1, +            }) +        for video in videos: +            format_url = video.get('url') +            if not format_url: +                continue +            if determine_ext(format_url) == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id='hls', fatal=False)) +            else: +                formats.append({ +                    'url': format_url, +                }) +        self._sort_formats(formats) + +        duration = float_or_none(try_get( +            models, lambda x: x[0]['data']['duration']), 1000) +        uploader = try_get( +            data, lambda x: x['user']['display_name'], compat_str) +        uploader_id = try_get( +            data, lambda x: x['user']['uid'], compat_str) +        view_count = int_or_none(try_get( +            data, lambda x: x['meta']['views_counter'])) + +        return { +            'id': video_id, +            'title': title, +            'duration': duration, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'view_count': view_count, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index b50f34e9b..f33fabe19 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -1,39 +1,95 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    determine_ext, +    int_or_none, +    parse_duration, +)  class YouJizzIE(InfoExtractor): -    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])' +    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'      _TESTS = [{          'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', -        'md5': '78fc1901148284c69af12640e01c6310', +        'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4',          'info_dict': {              'id': '2189178',              'ext': 'mp4',              'title': 'Zeichentrick 1',              'age_limit': 18, +            'duration': 2874,          }      }, {          'url': 'http://www.youjizz.com/videos/-2189178.html',          'only_matching': True, +    }, { +        'url': 'https://www.youjizz.com/videos/embed/31991001', +        'only_matching': True,      }]      def _real_extract(self, url): -        video_id = self._match_id(url) +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') or mobj.group('embed_id') + 
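
The rewritten YouJizz _VALID_URL above accepts both the regular watch URL and the /videos/embed/ form, so exactly one of the two named groups is set for any given match; that is why _real_extract falls back from group('id') to group('embed_id'). A minimal sketch, using only the test URLs and the pattern from this diff:

    import re

    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'

    for url in ('http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
                'https://www.youjizz.com/videos/embed/31991001'):
        mobj = re.match(_VALID_URL, url)
        # exactly one group is populated, depending on which branch matched
        print(mobj.group('id') or mobj.group('embed_id'))  # 2189178, then 31991001
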
         webpage = self._download_webpage(url, video_id) -        # YouJizz's HTML5 player has invalid HTML -        webpage = webpage.replace('"controls', '" controls') -        age_limit = self._rta_search(webpage) -        video_title = self._html_search_regex( -            r'<title>\s*(.*)\s*</title>', webpage, 'title') -        info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] +        title = self._html_search_regex( +            r'<title>(.+?)</title>', webpage, 'title') + +        formats = [] + +        encodings = self._parse_json( +            self._search_regex( +                r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', +                default='[]'), +            video_id, fatal=False) +        for encoding in encodings: +            if not isinstance(encoding, dict): +                continue +            format_url = encoding.get('filename') +            if not isinstance(format_url, compat_str): +                continue +            if determine_ext(format_url) == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id='hls', fatal=False)) +            else: +                format_id = encoding.get('name') or encoding.get('quality') +                height = int_or_none(self._search_regex( +                    r'^(\d+)[pP]', format_id, 'height', default=None)) +                formats.append({ +                    'url': format_url, +                    'format_id': format_id, +                    'height': height, +                }) + +        if formats: +            info_dict = { +                'formats': formats, +            } +        else: +            # YouJizz's HTML5 player has invalid HTML +            webpage = webpage.replace('"controls', '" controls') +            info_dict = self._parse_html5_media_entries( +                url, webpage, video_id)[0] + +        duration = parse_duration(self._search_regex( +            r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration', +            default=None)) +        uploader = self._search_regex( +            r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader', +            default=None)          info_dict.update({              'id': video_id, -            'title': video_title, -            'age_limit': age_limit, +            'title': title, +            'age_limit': self._rta_search(webpage), +            'duration': duration, +            'uploader': uploader,          })          return info_dict diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 73ebe5759..0c4bc2eda 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -1,23 +1,18 @@  # coding: utf-8  from __future__ import unicode_literals -import base64 -import itertools  import random  import re  import string  import time  from .common import InfoExtractor -from ..compat import ( -    compat_ord, -    compat_str, -    compat_urllib_parse_urlencode, -)  from ..utils import (      ExtractorError, -    get_element_by_attribute, -    try_get, +    get_element_by_class, +    js_to_json, +    str_or_none, +    strip_jsonp,  ) @@ -26,7 +21,9 @@ class YoukuIE(InfoExtractor):      IE_DESC = '优酷'      _VALID_URL = r'''(?x)          (?: -            http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| +            https?://( +                (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| +                
video\.tudou\.com/v/)|              youku:)          (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)      ''' @@ -35,9 +32,15 @@ class YoukuIE(InfoExtractor):          # MD5 is unstable          'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',          'info_dict': { -            'id': 'XMTc1ODE5Njcy_part1', +            'id': 'XMTc1ODE5Njcy',              'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', -            'ext': 'flv' +            'ext': 'mp4', +            'duration': 74.73, +            'thumbnail': r're:^https?://.*', +            'uploader': '。躲猫猫、', +            'uploader_id': '36017967', +            'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4', +            'tags': list,          }      }, {          'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', @@ -46,25 +49,42 @@ class YoukuIE(InfoExtractor):          'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',          'info_dict': {              'id': 'XODgxNjg1Mzk2', +            'ext': 'mp4',              'title': '武媚娘传奇 85', +            'duration': 1999.61, +            'thumbnail': r're:^https?://.*', +            'uploader': '疯狂豆花', +            'uploader_id': '62583473', +            'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky', +            'tags': list,          }, -        'playlist_count': 11, -        'skip': 'Available in China only',      }, {          'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',          'info_dict': {              'id': 'XMTI1OTczNDM5Mg', +            'ext': 'mp4',              'title': '花千骨 04', +            'duration': 2363, +            'thumbnail': r're:^https?://.*', +            'uploader': '放剧场-花千骨', +            'uploader_id': '772849359', +            'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==', +            'tags': list,          }, -        'playlist_count': 13,      }, {          'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html',          'note': 'Video protected with password',          'info_dict': {              'id': 'XNjA1NzA2Njgw', +            'ext': 'mp4',              'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起', +            'duration': 7264.5, +            'thumbnail': r're:^https?://.*', +            'uploader': 'FoxJin1006', +            'uploader_id': '322014285', +            'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==', +            'tags': list,          }, -        'playlist_count': 19,          'params': {              'videopassword': '100600',          }, @@ -73,130 +93,38 @@ class YoukuIE(InfoExtractor):          'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html',          'info_dict': {              'id': 'XOTUxMzg4NDMy', +            'ext': 'mp4',              'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft', +            'duration': 702.08, +            'thumbnail': r're:^https?://.*', +            'uploader': '明月庄主moon', +            'uploader_id': '38465621', +            'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0', +            'tags': list, +        }, +    }, { +        'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805', +        'info_dict': { +            'id': 'XMjIyNzAzMTQ4NA', +            'ext': 'mp4', +            'title': '卡马乔国足开大脚长传冲吊集锦', +            'duration': 289, +            'thumbnail': r're:^https?://.*', +            'uploader': '阿卜杜拉之星', +            'uploader_id': '2382249', +            'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==', +            'tags': list,          }, -        'playlist_count': 6, +    }, { +    
    'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html', +        'only_matching': True,      }] -    def construct_video_urls(self, data): -        # get sid, token -        def yk_t(s1, s2): -            ls = list(range(256)) -            t = 0 -            for i in range(256): -                t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256 -                ls[i], ls[t] = ls[t], ls[i] -            s = bytearray() -            x, y = 0, 0 -            for i in range(len(s2)): -                y = (y + 1) % 256 -                x = (x + ls[y]) % 256 -                ls[x], ls[y] = ls[y], ls[x] -                s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256]) -            return bytes(s) - -        sid, token = yk_t( -            b'becaf9be', base64.b64decode(data['security']['encrypt_string'].encode('ascii')) -        ).decode('ascii').split('_') - -        # get oip -        oip = data['security']['ip'] - -        fileid_dict = {} -        for stream in data['stream']: -            if stream.get('channel_type') == 'tail': -                continue -            format = stream.get('stream_type') -            fileid = try_get( -                stream, lambda x: x['segs'][0]['fileid'], -                compat_str) or stream['stream_fileid'] -            fileid_dict[format] = fileid - -        def get_fileid(format, n): -            number = hex(int(str(n), 10))[2:].upper() -            if len(number) == 1: -                number = '0' + number -            streamfileids = fileid_dict[format] -            fileid = streamfileids[0:8] + number + streamfileids[10:] -            return fileid - -        # get ep -        def generate_ep(format, n): -            fileid = get_fileid(format, n) -            ep_t = yk_t( -                b'bf7e5f01', -                ('%s_%s_%s' % (sid, fileid, token)).encode('ascii') -            ) -            ep = base64.b64encode(ep_t).decode('ascii') -            return ep - -        # generate video_urls -        video_urls_dict = {} -        for stream in data['stream']: -            if stream.get('channel_type') == 'tail': -                continue -            format = stream.get('stream_type') -            video_urls = [] -            for dt in stream['segs']: -                n = str(stream['segs'].index(dt)) -                param = { -                    'K': dt['key'], -                    'hd': self.get_hd(format), -                    'myp': 0, -                    'ypp': 0, -                    'ctype': 12, -                    'ev': 1, -                    'token': token, -                    'oip': oip, -                    'ep': generate_ep(format, n) -                } -                video_url = \ -                    'http://k.youku.com/player/getFlvPath/' + \ -                    'sid/' + sid + \ -                    '_00' + \ -                    '/st/' + self.parse_ext_l(format) + \ -                    '/fileid/' + get_fileid(format, n) + '?' 
+ \ -                    compat_urllib_parse_urlencode(param) -                video_urls.append(video_url) -            video_urls_dict[format] = video_urls - -        return video_urls_dict -      @staticmethod      def get_ysuid():          return '%d%s' % (int(time.time()), ''.join([              random.choice(string.ascii_letters) for i in range(3)])) -    def get_hd(self, fm): -        hd_id_dict = { -            '3gp': '0', -            '3gphd': '1', -            'flv': '0', -            'flvhd': '0', -            'mp4': '1', -            'mp4hd': '1', -            'mp4hd2': '1', -            'mp4hd3': '1', -            'hd2': '2', -            'hd3': '3', -        } -        return hd_id_dict[fm] - -    def parse_ext_l(self, fm): -        ext_dict = { -            '3gp': 'flv', -            '3gphd': 'mp4', -            'flv': 'flv', -            'flvhd': 'flv', -            'mp4': 'mp4', -            'mp4hd': 'mp4', -            'mp4hd2': 'flv', -            'mp4hd3': 'flv', -            'hd2': 'flv', -            'hd3': 'flv', -        } -        return ext_dict[fm] -      def get_format_name(self, fm):          _dict = {              '3gp': 'h6', @@ -210,32 +138,40 @@ class YoukuIE(InfoExtractor):              'hd2': 'h2',              'hd3': 'h1',          } -        return _dict[fm] +        return _dict.get(fm)      def _real_extract(self, url):          video_id = self._match_id(url)          self._set_cookie('youku.com', '__ysuid', self.get_ysuid()) +        self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') -        def retrieve_data(req_url, note): -            headers = { -                'Referer': req_url, -            } -            headers.update(self.geo_verification_headers()) -            self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') +        _, urlh = self._download_webpage_handle( +            'https://log.mmstat.com/eg.js', video_id, 'Retrieving cna info') +        # The etag header is '"foobar"'; let's remove the double quotes +        cna = urlh.headers['etag'][1:-1] -            raw_data = self._download_json(req_url, video_id, note=note, headers=headers) - -            return raw_data['data'] +        # request basic data +        basic_data_params = { +            'vid': video_id, +            'ccode': '0402' if 'tudou.com' in url else '0401', +            'client_ip': '192.168.1.1', +            'utid': cna, +            'client_ts': time.time() / 1000, +        }          video_password = self._downloader.params.get('videopassword') - -        # request basic data -        basic_data_url = 'http://play.youku.com/play/get.json?vid=%s&ct=12' % video_id          if video_password: -            basic_data_url += '&pwd=%s' % video_password +            basic_data_params['password'] = video_password -        data = retrieve_data(basic_data_url, 'Downloading JSON metadata') +        headers = { +            'Referer': url, +        } +        headers.update(self.geo_verification_headers()) +        data = self._download_json( +            'https://ups.youku.com/ups/get.json', video_id, +            'Downloading JSON metadata', +            query=basic_data_params, headers=headers)['data']          error = data.get('error')          if error: @@ -253,86 +189,111 @@ class YoukuIE(InfoExtractor):                  raise ExtractorError(msg)          # get video title -        title = data['video']['title'] - -        # generate video_urls_dict -        video_urls_dict = self.construct_video_urls(data) - -        # construct info - 
       entries = [{ -            'id': '%s_part%d' % (video_id, i + 1), -            'title': title, -            'formats': [], -            # some formats are not available for all parts, we have to detect -            # which one has all -        } for i in range(max(len(v.get('segs')) for v in data['stream']))] -        for stream in data['stream']: -            if stream.get('channel_type') == 'tail': -                continue -            fm = stream.get('stream_type') -            video_urls = video_urls_dict[fm] -            for video_url, seg, entry in zip(video_urls, stream['segs'], entries): -                entry['formats'].append({ -                    'url': video_url, -                    'format_id': self.get_format_name(fm), -                    'ext': self.parse_ext_l(fm), -                    'filesize': int(seg['size']), -                    'width': stream.get('width'), -                    'height': stream.get('height'), -                }) +        video_data = data['video'] +        title = video_data['title'] + +        formats = [{ +            'url': stream['m3u8_url'], +            'format_id': self.get_format_name(stream.get('stream_type')), +            'ext': 'mp4', +            'protocol': 'm3u8_native', +            'filesize': int(stream.get('size')), +            'width': stream.get('width'), +            'height': stream.get('height'), +        } for stream in data['stream'] if stream.get('channel_type') != 'tail'] +        self._sort_formats(formats)          return { -            '_type': 'multi_video',              'id': video_id,              'title': title, -            'entries': entries, +            'formats': formats, +            'duration': video_data.get('seconds'), +            'thumbnail': video_data.get('logo'), +            'uploader': video_data.get('username'), +            'uploader_id': str_or_none(video_data.get('userid')), +            'uploader_url': data.get('uploader', {}).get('homepage'), +            'tags': video_data.get('tags'),          }  class YoukuShowIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html' +    _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html'      IE_NAME = 'youku:show' -    _TEST = { -        'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html', +    _TESTS = [{ +        'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html',          'info_dict': {              'id': 'zc7c670be07ff11e48b3f', -            'title': '花千骨 未删减版', -            'description': 'md5:578d4f2145ae3f9128d9d4d863312910', +            'title': '花千骨 DVD版', +            'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558',          },          'playlist_count': 50, -    } - -    _PAGE_SIZE = 40 +    }, { +        # Episode number not starting from 1 +        'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html', +        'info_dict': { +            'id': 'zefbfbd70efbfbd780bef', +            'title': '超级飞侠3', +            'description': 'md5:275715156abebe5ccc2a1992e9d56b98', +        }, +        'playlist_count': 24, +    }, { +        # Ongoing playlist. 
The initial page is the last one
+        'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html',
+        'only_matching': True,
+    }]
-
-    def _find_videos_in_page(self, webpage):
-        videos = re.findall(
-            r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"', webpage)
-        return [
-            self.url_result(video_url, YoukuIE.ie_key(), title)
-            for video_url, title in videos]
+    def _extract_entries(self, playlist_data_url, show_id, note, query):
+        query['callback'] = 'cb'
+        playlist_data = self._download_json(
+            playlist_data_url, show_id, query=query, note=note,
+            transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
+        drama_list = (get_element_by_class('p-drama-grid', playlist_data) or
+                      get_element_by_class('p-drama-half-row', playlist_data))
+        if drama_list is None:
+            raise ExtractorError('No episodes found')
+        video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list)
+        return playlist_data, [
+            self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key())
+            for video_url in video_urls]
 
     def _real_extract(self, url):
         show_id = self._match_id(url)
         webpage = self._download_webpage(url, show_id)
 
-        entries = self._find_videos_in_page(webpage)
-
-        playlist_title = self._html_search_regex(
-            r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False)
-        detail_div = get_element_by_attribute('class', 'detail', webpage) or ''
-        playlist_description = self._html_search_regex(
-            r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>',
-            detail_div, 'playlist description', fatal=False)
-
-        for idx in itertools.count(1):
-            episodes_page = self._download_webpage(
-                'http://www.youku.com/show_episode/id_%s.html' % show_id,
-                show_id, query={'divid': 'reload_%d' % (idx * self._PAGE_SIZE + 1)},
-                note='Downloading episodes page %d' % idx)
-            new_entries = self._find_videos_in_page(episodes_page)
+        entries = []
+        page_config = self._parse_json(self._search_regex(
+            r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'),
+            show_id, transform_source=js_to_json)
+        first_page, initial_entries = self._extract_entries(
+            'http://list.youku.com/show/module', show_id,
+            note='Downloading initial playlist data page',
+            query={
+                'id': page_config['showid'],
+                'tab': 'showInfo',
+            })
+        first_page_reload_id = self._html_search_regex(
+            r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id')
+        # The first reload_id has the same items as first_page
+        reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page)
+        for idx, reload_id in enumerate(reload_ids):
+            if reload_id == first_page_reload_id:
+                entries.extend(initial_entries)
+                continue
+            _, new_entries = self._extract_entries(
+                'http://list.youku.com/show/episode', show_id,
+                note='Downloading playlist data page %d' % (idx + 1),
+                query={
+                    'id': page_config['showid'],
+                    'stage': reload_id,
+                })
             entries.extend(new_entries)
-
           if len(new_entries) < self._PAGE_SIZE: -                break -        return self.playlist_result(entries, show_id, playlist_title, playlist_description) +        desc = self._html_search_meta('description', webpage, fatal=False) +        playlist_title = desc.split(',')[0] if desc else None +        detail_li = get_element_by_class('p-intro', webpage) +        playlist_description = get_element_by_class( +            'intro-more', detail_li) if detail_li else None + +        return self.playlist_result( +            entries, show_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 34ab878a4..547adefeb 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import (      int_or_none,      sanitized_Request, @@ -26,7 +27,7 @@ class YouPornIE(InfoExtractor):              'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',              'thumbnail': r're:^https?://.*\.jpg$',              'uploader': 'Ask Dan And Jennifer', -            'upload_date': '20101221', +            'upload_date': '20101217',              'average_rating': int,              'view_count': int,              'comment_count': int, @@ -45,7 +46,7 @@ class YouPornIE(InfoExtractor):              'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4',              'thumbnail': r're:^https?://.*\.jpg$',              'uploader': 'Unknown', -            'upload_date': '20111125', +            'upload_date': '20110418',              'average_rating': int,              'view_count': int,              'comment_count': int, @@ -68,28 +69,46 @@ class YouPornIE(InfoExtractor):          webpage = self._download_webpage(request, display_id)          title = self._search_regex( -            [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>.+?)\1', -             r'<h1[^>]+class=["\']heading\d?["\'][^>]*>([^<])<'], -            webpage, 'title', group='title') +            [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>(?:(?!\1).)+)\1', +             r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'], +            webpage, 'title', group='title', +            default=None) or self._og_search_title( +            webpage, default=None) or self._html_search_meta( +            'title', webpage, fatal=True)          links = [] +        # Main source +        definitions = self._parse_json( +            self._search_regex( +                r'mediaDefinition\s*=\s*(\[.+?\]);', webpage, +                'media definitions', default='[]'), +            video_id, fatal=False) +        if definitions: +            for definition in definitions: +                if not isinstance(definition, dict): +                    continue +                video_url = definition.get('videoUrl') +                if isinstance(video_url, compat_str) and video_url: +                    links.append(video_url) + +        # Fallback #1, this also contains extra low quality 180p format +        for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): +            links.append(link) + +        # Fallback #2 (unavailable as at 22.06.2017)          sources = self._search_regex(              r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', 
default=None)          if sources:              for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):                  links.append(link) -        # Fallback #1 +        # Fallback #3 (unavailable as at 22.06.2017)          for _, link in re.findall( -                r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): -            links.append(link) - -        # Fallback #2, this also contains extra low quality 180p format -        for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): +                r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):              links.append(link) -        # Fallback #3, encrypted links +        # Fallback #4, encrypted links (unavailable as at 22.06.2017)          for _, encrypted_link in re.findall(                  r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage):              links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8')) @@ -124,7 +143,8 @@ class YouPornIE(InfoExtractor):              r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',              webpage, 'uploader', fatal=False)          upload_date = unified_strdate(self._html_search_regex( -            r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>', +            [r'Date\s+[Aa]dded:\s*<span>([^<]+)', +             r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'],              webpage, 'upload date', fatal=False))          age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 480f403da..ad2e933ee 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter  from ..swfinterp import SWFInterpreter  from ..compat import (      compat_chr, +    compat_kwargs,      compat_parse_qs,      compat_urllib_parse_unquote,      compat_urllib_parse_unquote_plus, @@ -38,7 +39,6 @@ from ..utils import (      parse_duration,      remove_quotes,      remove_start, -    sanitized_Request,      smuggle_url,      str_to_int,      try_get, @@ -54,7 +54,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):      """Provide base functions for Youtube extractors"""      _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'      _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' -    _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password' + +    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' +    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' +    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' +      _NETRC_MACHINE = 'youtube'      # If True it will raise an error if no login info is provided      _LOGIN_REQUIRED = False @@ -96,74 +100,157 @@ class YoutubeBaseInfoExtractor(InfoExtractor):          login_form = self._hidden_inputs(login_page) -        login_form.update({ -            'checkConnection': 'youtube', -            'Email': username, -            'Passwd': password, -        }) +        def req(url, f_req, note, errnote): +            data = login_form.copy() +            data.update({ +                'pstMsg': 1, +                'checkConnection': 'youtube', +                'checkedDomains': 'youtube', +                'hl': 'en', +                'deviceinfo': 
'[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', +                'f.req': json.dumps(f_req), +                'flowName': 'GlifWebSignIn', +                'flowEntry': 'ServiceLogin', +            }) +            return self._download_json( +                url, None, note=note, errnote=errnote, +                transform_source=lambda s: re.sub(r'^[^[]*', '', s), +                fatal=False, +                data=urlencode_postdata(data), headers={ +                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', +                    'Google-Accounts-XSRF': 1, +                }) -        login_results = self._download_webpage( -            self._PASSWORD_CHALLENGE_URL, None, -            note='Logging in', errnote='unable to log in', fatal=False, -            data=urlencode_postdata(login_form)) -        if login_results is False: -            return False +        def warn(message): +            self._downloader.report_warning(message) + +        lookup_req = [ +            username, +            None, [], None, 'US', None, None, 2, False, True, +            [ +                None, None, +                [2, 1, None, 1, +                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', +                 None, [], 4], +                1, [None, None, []], None, None, None, True +            ], +            username, +        ] -        error_msg = self._html_search_regex( -            r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<', -            login_results, 'error message', default=None) -        if error_msg: -            raise ExtractorError('Unable to login: %s' % error_msg, expected=True) +        lookup_results = req( +            self._LOOKUP_URL, lookup_req, +            'Looking up account info', 'Unable to look up account info') -        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: -            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) +        if lookup_results is False: +            return False -        # Two-Factor -        # TODO add SMS and phone call support - these require making a request and then prompting the user +        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) +        if not user_hash: +            warn('Unable to extract user hash') +            return False -        if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None: -            tfa_code = self._get_tfa_info('2-step verification code') +        challenge_req = [ +            user_hash, +            None, 1, None, [1, None, None, None, [password, None, True]], +            [ +                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], +                1, [None, None, []], None, None, None, True +            ]] -            if not tfa_code: -                self._downloader.report_warning( -                    'Two-factor authentication required. 
Provide it either interactively or with --twofactor <code>'
-                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
-                return False
+        challenge_results = req(
+            self._CHALLENGE_URL, challenge_req,
+            'Logging in', 'Unable to log in')
-            tfa_code = remove_start(tfa_code, 'G-')
+        if challenge_results is False:
+            return
-            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
+        login_res = try_get(challenge_results, lambda x: x[0][5], list)
+        if login_res:
+            login_msg = try_get(login_res, lambda x: x[5], compat_str)
+            warn(
+                'Unable to login: %s' % ('Invalid password'
+                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
+            return False
-            tfa_form_strs.update({
-                'Pin': tfa_code,
-                'TrustDevice': 'on',
-            })
+        res = try_get(challenge_results, lambda x: x[0][-1], list)
+        if not res:
+            warn('Unable to extract result entry')
+            return False
-            tfa_data = urlencode_postdata(tfa_form_strs)
+        tfa = try_get(res, lambda x: x[0][0], list)
+        if tfa:
+            tfa_str = try_get(tfa, lambda x: x[2], compat_str)
+            if tfa_str == 'TWO_STEP_VERIFICATION':
+                # SEND_SUCCESS - TFA code has been successfully sent to phone
+                # QUOTA_EXCEEDED - reached the limit of TFA codes
+                status = try_get(tfa, lambda x: x[5], compat_str)
+                if status == 'QUOTA_EXCEEDED':
+                    warn('Exceeded the limit of TFA codes, try later')
+                    return False
+
+                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
+                if not tl:
+                    warn('Unable to extract TL')
+                    return False
+
+                tfa_code = self._get_tfa_info('2-step verification code')
+
+                if not tfa_code:
+                    warn(
+                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
+                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+                    return False
+
+                tfa_code = remove_start(tfa_code, 'G-')
+
+                tfa_req = [
+                    user_hash, None, 2, None,
+                    [
+                        9, None, None, None, None, None, None, None,
+                        [None, tfa_code, True, 2]
+                    ]]
+
+                tfa_results = req(
+                    self._TFA_URL.format(tl), tfa_req,
+                    'Submitting TFA code', 'Unable to submit TFA code')
+
+                if tfa_results is False:
+                    return False
+
+                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
+                if tfa_res:
+                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
+                    warn(
+                        'Unable to finish TFA: %s' % ('Invalid TFA code'
+                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
+                    return False
+
+                check_cookie_url = try_get(
+                    tfa_results, lambda x: x[0][-1][2], compat_str)
+        else:
+            check_cookie_url = try_get(res, lambda x: x[2], compat_str)
-            tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
-            tfa_results = self._download_webpage(
-                tfa_req, None,
-                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
+        if not check_cookie_url:
+            warn('Unable to extract CheckCookie URL')
+            return False
-            if tfa_results is False:
-                return False
+        check_cookie_results = self._download_webpage(
+            check_cookie_url, None, 'Checking cookie', fatal=False)
-            if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None:
-                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
-                return False
-            if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None:
-                self._downloader.report_warning('unable to log in - did the page structure change?')
-                return False
-            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
-                self._downloader.report_warning('Your Google account has a security notice.
Please log in on your web browser, resolve the notice, and try again.') -                return False +        if check_cookie_results is False: +            return False -        if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None: -            self._downloader.report_warning('unable to log in: bad username or password') +        if 'https://myaccount.google.com/' not in check_cookie_results: +            warn('Unable to log in')              return False +          return True +    def _download_webpage(self, *args, **kwargs): +        kwargs.setdefault('query', {})['disable_polymer'] = 'true' +        return super(YoutubeBaseInfoExtractor, self)._download_webpage( +            *args, **compat_kwargs(kwargs)) +      def _real_initialize(self):          if self._downloader is None:              return @@ -592,6 +679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              },          },          # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) +        # YouTube Red ad is not captured for creator          {              'url': '__2ABJjxzNo',              'info_dict': { @@ -922,6 +1010,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              ],          },          { +            # The following content has been identified by the YouTube community +            # as inappropriate or offensive to some audiences. +            'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', +            'info_dict': { +                'id': '6SJNVb0GnPI', +                'ext': 'mp4', +                'title': 'Race Differences in Intelligence', +                'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', +                'duration': 965, +                'upload_date': '20140124', +                'uploader': 'New Century Foundation', +                'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', +                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', +                'license': 'Standard YouTube License', +                'view_count': int, +            }, +            'params': { +                'skip_download': True, +            }, +        }, +        {              # itag 212              'url': '1t24XAntNCY',              'only_matching': True, @@ -1188,37 +1297,57 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      sub_lang_list[sub_lang] = sub_formats                  return sub_lang_list +            def make_captions(sub_url, sub_langs): +                parsed_sub_url = compat_urllib_parse_urlparse(sub_url) +                caption_qs = compat_parse_qs(parsed_sub_url.query) +                captions = {} +                for sub_lang in sub_langs: +                    sub_formats = [] +                    for ext in self._SUBTITLE_FORMATS: +                        caption_qs.update({ +                            'tlang': [sub_lang], +                            'fmt': [ext], +                        }) +                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( +                            query=compat_urllib_parse_urlencode(caption_qs, True))) +                        sub_formats.append({ +                            'url': sub_url, +                            'ext': ext, +                        }) +                    captions[sub_lang] = sub_formats +                return captions + +            # New captions format as of 22.06.2017 +            player_response = args.get('player_response') +            if player_response and 
isinstance(player_response, compat_str):
+                player_response = self._parse_json(
+                    player_response, video_id, fatal=False)
+                if player_response:
+                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+                    base_url = renderer['captionTracks'][0]['baseUrl']
+                    sub_lang_list = []
+                    for lang in renderer['translationLanguages']:
+                        lang_code = lang.get('languageCode')
+                        if lang_code:
+                            sub_lang_list.append(lang_code)
+                    return make_captions(base_url, sub_lang_list)
+
             # Some videos don't provide ttsurl but rather caption_tracks and
             # caption_translation_languages (e.g. 20LmZk1hakA)
+            # Not used anymore as of 22.06.2017
             caption_tracks = args['caption_tracks']
             caption_translation_languages = args['caption_translation_languages']
             caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
-            parsed_caption_url = compat_urllib_parse_urlparse(caption_url)
-            caption_qs = compat_parse_qs(parsed_caption_url.query)
-
-            sub_lang_list = {}
+            sub_lang_list = []
             for lang in caption_translation_languages.split(','):
                 lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                 sub_lang = lang_qs.get('lc', [None])[0]
-                if not sub_lang:
-                    continue
-                sub_formats = []
-                for ext in self._SUBTITLE_FORMATS:
-                    caption_qs.update({
-                        'tlang': [sub_lang],
-                        'fmt': [ext],
-                    })
-                    sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace(
-                        query=compat_urllib_parse_urlencode(caption_qs, True)))
-                    sub_formats.append({
-                        'url': sub_url,
-                        'ext': ext,
-                    })
-                sub_lang_list[sub_lang] = sub_formats
-            return sub_lang_list
+                if sub_lang:
+                    sub_lang_list.append(sub_lang)
+            return make_captions(caption_url, sub_lang_list)
         # An extractor error can be raised by the download process if there are
         # no automatic captions but there are subtitles
-        except (KeyError, ExtractorError):
+        except (KeyError, IndexError, ExtractorError):
             self._downloader.report_warning(err_msg)
             return {}
@@ -1245,6 +1374,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             playback_url, video_id, 'Marking watched',
             'Unable to mark watched', fatal=False)
+    @staticmethod
+    def _extract_urls(webpage):
+        # Embedded YouTube player
+        entries = [
+            unescapeHTML(mobj.group('url'))
+            for mobj in re.finditer(r'''(?x)
+            (?:
+                <iframe[^>]+?src=|
+                data-video-url=|
+                <embed[^>]+?src=|
+                embedSWF\(?:\s*|
+                <object[^>]+data=|
+                new\s+SWFObject\(
+            )
+            (["\'])
+                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
+                (?:embed|v|p)/.+?)
+            \1''', webpage)] + +        # lazyYT YouTube embed +        entries.extend(list(map( +            unescapeHTML, +            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + +        # Wordpress "YouTube Video Importer" plugin +        matches = re.findall(r'''(?x)<div[^>]+ +            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ +            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) +        entries.extend(m[-1] for m in matches) + +        return entries + +    @staticmethod +    def _extract_url(webpage): +        urls = YoutubeIE._extract_urls(webpage) +        return urls[0] if urls else None +      @classmethod      def extract_id(cls, url):          mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -1257,6 +1423,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id          return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') +    @staticmethod +    def _extract_chapters(description, duration): +        if not description: +            return None +        chapter_lines = re.findall( +            r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', +            description) +        if not chapter_lines: +            return None +        chapters = [] +        for next_num, (chapter_line, time_point) in enumerate( +                chapter_lines, start=1): +            start_time = parse_duration(time_point) +            if start_time is None: +                continue +            if start_time > duration: +                break +            end_time = (duration if next_num == len(chapter_lines) +                        else parse_duration(chapter_lines[next_num][1])) +            if end_time is None: +                continue +            if end_time > duration: +                end_time = duration +            if start_time > end_time: +                break +            chapter_title = re.sub( +                r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') +            chapter_title = re.sub(r'\s+', ' ', chapter_title) +            chapters.append({ +                'start_time': start_time, +                'end_time': end_time, +                'title': chapter_title, +            }) +        return chapters +      def _real_extract(self, url):          url, smuggled_data = unsmuggle_url(url, {}) @@ -1300,9 +1501,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              if dash_mpd and dash_mpd[0] not in dash_mpds:                  dash_mpds.append(dash_mpd[0]) +        is_live = None +        view_count = None + +        def extract_view_count(v_info): +            return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) +          # Get video info          embed_webpage = None -        is_live = None          if re.search(r'player-age-gate-content">', video_webpage) is not None:              age_gate = True              # We simulate the access to the video from www.youtube.com/v/{video_id} @@ -1325,6 +1531,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          else:              age_gate = False              video_info = None +            sts = None              # Try looking directly into the video webpage              ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)              if ytplayer_config: 
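
The _extract_chapters() helper added above turns seekTo links in a video description into chapter entries. A minimal sketch of the intended behavior; the description markup below is hypothetical but mirrors what the regex targets (timestamp anchors separated by <br />):

    from youtube_dl.extractor.youtube import YoutubeIE

    # hypothetical description HTML with yt.www.watch.player.seekTo links
    description = (
        '<a href="#" onclick="yt.www.watch.player.seekTo(0*60+00);return false;">0:00</a> Intro<br />'
        '<a href="#" onclick="yt.www.watch.player.seekTo(1*60+30);return false;">1:30</a> Main part'
    )
    chapters = YoutubeIE._extract_chapters(description, duration=300)
    # roughly: [{'start_time': 0.0, 'end_time': 90.0, 'title': 'Intro'},
    #           {'start_time': 90.0, 'end_time': 300, 'title': 'Main part'}]
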
@@ -1341,6 +1548,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                          args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])                  if args.get('livestream') == '1' or args.get('live_playback') == 1:                      is_live = True +                sts = ytplayer_config.get('sts')              if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):                  # We also try looking in get_video_info since it may contain different dashmpd                  # URL that points to a DASH manifest with possibly different itag set (some itags @@ -1349,17 +1557,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  # The general idea is to take a union of itags of both DASH manifests (for example                  # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)                  self.report_video_info_webpage_download(video_id) -                for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: -                    video_info_url = ( -                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' -                        % (proto, video_id, el_type)) +                for el in ('info', 'embedded', 'detailpage', 'vevo', ''): +                    query = { +                        'video_id': video_id, +                        'ps': 'default', +                        'eurl': '', +                        'gl': 'US', +                        'hl': 'en', +                    } +                    if el: +                        query['el'] = el +                    if sts: +                        query['sts'] = sts                      video_info_webpage = self._download_webpage( -                        video_info_url, +                        '%s://www.youtube.com/get_video_info' % proto,                          video_id, note=False, -                        errnote='unable to download video info webpage') +                        errnote='unable to download video info webpage', +                        fatal=False, query=query) +                    if not video_info_webpage: +                        continue                      get_video_info = compat_parse_qs(video_info_webpage) -                    if get_video_info.get('use_cipher_signature') != ['True']: -                        add_dash_mpd(get_video_info) +                    add_dash_mpd(get_video_info) +                    if view_count is None: +                        view_count = extract_view_count(get_video_info)                      if not video_info:                          video_info = get_video_info                      if 'token' in get_video_info: @@ -1399,9 +1619,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              video_title = '_'          # description -        video_description = get_element_by_id("eow-description", video_webpage) +        description_original = video_description = get_element_by_id("eow-description", video_webpage)          if video_description: -            video_description = re.sub(r'''(?x) +            description_original = video_description = re.sub(r'''(?x)                  <a\s+                      (?:[a-zA-Z-]+="[^"]*"\s+)*?                      
(?:title|href)="([^"]+)"\s+ @@ -1443,10 +1663,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  return self.playlist_result(entries, video_id, video_title, video_description)              self.to_screen('Downloading just video %s because of --no-playlist' % video_id) -        if 'view_count' in video_info: -            view_count = int(video_info['view_count'][0]) -        else: -            view_count = None +        if view_count is None: +            view_count = extract_view_count(video_info)          # Check for "rental" videos          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: @@ -1490,10 +1708,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          if not upload_date:              upload_date = self._search_regex(                  [r'(?s)id="eow-date.*?>(.*?)</span>', -                 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'], +                 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],                  video_webpage, 'upload date', default=None) -            if upload_date: -                upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())          upload_date = unified_strdate(upload_date)          video_license = self._html_search_regex( @@ -1501,7 +1717,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              video_webpage, 'license', default=None)          m_music = re.search( -            r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li', +            r'''(?x) +                <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* +                <ul[^>]*>\s* +                <li>(?P<title>.+?) +                by (?P<creator>.+?) +                (?: +                    \(.+?\)| +                    <a[^>]* +                        (?: +                            \bhref=["\']/red[^>]*>|             # drop possible +                            >\s*Listen ad-free with YouTube Red # YouTube Red ad +                        ) +                    .*? 
+                )?</li
+            ''',
             video_webpage)
         if m_music:
             video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
@@ -1558,6 +1788,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         if self._downloader.params.get('writeannotations', False):
             video_annotations = self._extract_annotations(video_id)

+        chapters = self._extract_chapters(description_original, video_duration)
+
         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
             self.report_rtmp_download()
             formats = [{
@@ -1591,12 +1823,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 format_id = url_data['itag'][0]
                 url = url_data['url'][0]

-                if 'sig' in url_data:
-                    url += '&signature=' + url_data['sig'][0]
-                elif 's' in url_data:
-                    encrypted_sig = url_data['s'][0]
+                if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
                     ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
-
                     jsplayer_url_json = self._search_regex(
                         ASSETS_RE,
                         embed_webpage if age_gate else video_webpage,
@@ -1617,6 +1845,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                             video_webpage, 'age gate player URL')
                         player_url = json.loads(player_url_json)

+                if 'sig' in url_data:
+                    url += '&signature=' + url_data['sig'][0]
+                elif 's' in url_data:
+                    encrypted_sig = url_data['s'][0]
+
                     if self._downloader.params.get('verbose'):
                         if player_url is None:
                             player_version = 'unknown'
@@ -1790,6 +2023,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'duration': video_duration,
             'age_limit': 18 if age_gate else 0,
             'annotations': video_annotations,
+            'chapters': chapters,
             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
             'view_count': view_count,
             'like_count': like_count,
@@ -1861,7 +2095,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
                      |
                         (%(playlist_id)s)
                      )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
-    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
+    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true'
     _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
     IE_NAME = 'youtube:playlist'
     _TESTS = [{
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index 24cdec28c..7bda59610 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -6,6 +6,7 @@ import re

 from .utils import (
     ExtractorError,
+    remove_quotes,
 )

 _OPERATORS = [
@@ -57,7 +58,6 @@ class JSInterpreter(object):

     def interpret_expression(self, expr, local_vars, allow_recursion):
         expr = expr.strip()
-
         if expr == '':  # Empty expression
             return None
@@ -121,11 +121,19 @@ class JSInterpreter(object):
             pass

         m = re.match(
-            r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
+            r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
+        if m:
+            val = local_vars[m.group('in')]
+            idx = self.interpret_expression(
+                m.group('idx'), local_vars, allow_recursion - 1)
+            return val[idx]
+
+        m = re.match(
+            r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
             expr)
         if m:
             variable = m.group('var')
-            member = m.group('member')
+            member = remove_quotes(m.group('member') or m.group('member2'))
             arg_str = m.group('args')

             if variable in local_vars:
@@ -173,14 +181,6 @@ class JSInterpreter(object):

             return obj[member](argvals)

-        m = re.match(
-            r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
-        if m:
-            val = local_vars[m.group('in')]
-            idx = self.interpret_expression(
-                m.group('idx'), local_vars, allow_recursion - 1)
-            return val[idx]
-
         for op, opfunc in _OPERATORS:
             m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
             if not m:
@@ -211,21 +211,25 @@ class JSInterpreter(object):
         raise ExtractorError('Unsupported JS expression %r' % expr)

     def extract_object(self, objname):
+        _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
         obj = {}
         obj_m = re.search(
-            (r'(?<!this\.)%s\s*=\s*\{' % re.escape(objname)) +
-            r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)' +
-            r'\}\s*;',
+            r'''(?x)
+                (?<!this\.)%s\s*=\s*{\s*
+                    (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
+                }\s*;
+            ''' % (re.escape(objname), _FUNC_NAME_RE),
             self.code)
         fields = obj_m.group('fields')
         # Currently, it only supports function definitions
         fields_m = re.finditer(
-            r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function'
-            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
+            r'''(?x)
+                (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}
+            ''' % _FUNC_NAME_RE,
             fields)
         for f in fields_m:
             argnames = f.group('args').split(',')
-            obj[f.group('key')] = self.build_function(argnames, f.group('code'))
+            obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))

         return obj
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 52309fb84..38439c971 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -20,6 +20,24 @@ from .utils import (
 from .version import __version__

+
+def _hide_login_info(opts):
+    PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
+    eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
+
+    def _scrub_eq(o):
+        m = eqre.match(o)
+        if m:
+            return m.group('key') + '=PRIVATE'
+        else:
+            return o
+
+    opts = list(map(_scrub_eq, opts))
+    for idx, opt in enumerate(opts):
+        if opt in PRIVATE_OPTS and idx + 1 < len(opts):
+            opts[idx + 1] = 'PRIVATE'
+    return opts
+
+
 def parseOpts(overrideArguments=None):
     def _readOptions(filename_bytes, default=[]):
         try:
@@ -93,26 +111,6 @@ def parseOpts(overrideArguments=None):
     def _comma_separated_values_options_callback(option, opt_str, value, parser):
         setattr(parser.values, option.dest, value.split(','))

-    def _hide_login_info(opts):
-        PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username']
-        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
-
-        def _scrub_eq(o):
-            m = eqre.match(o)
-            if m:
-                return m.group('key') + '=PRIVATE'
-            else:
-                return o
-
-        opts = list(map(_scrub_eq, opts))
-        for private_opt in PRIVATE_OPTS:
-            try:
-                i = opts.index(private_opt)
-                opts[i + 1] = 'PRIVATE'
-            except ValueError:
-                pass
-        return opts
-
     # No need to wrap help messages if we're on a wide console
     columns = compat_get_terminal_size().columns
     max_width = columns if columns else 80
@@ -310,7 +308,7 @@ def parseOpts(overrideArguments=None):
         metavar='FILTER', dest='match_filter', default=None,
         help=(
             'Generic video filter. '
-            'Specify any key (see help for -o for a list of available keys) to '
+            'Specify any key (see the "OUTPUT TEMPLATE" for a list of available keys) to '
             'match if the key is present, '
             '!key to check if the key is not present, '
             'key > NUMBER (like "comment_count > 12", also works with '
@@ -618,7 +616,7 @@ def parseOpts(overrideArguments=None):
     verbosity.add_option(
         '-j', '--dump-json',
         action='store_true', dest='dumpjson', default=False,
-        help='Simulate, quiet but print JSON information. See --output for a description of available keys.')
+        help='Simulate, quiet but print JSON information. See the "OUTPUT TEMPLATE" for a description of available keys.')
     verbosity.add_option(
         '-J', '--dump-single-json',
         action='store_true', dest='dump_single_json', default=False,
@@ -814,11 +812,12 @@ def parseOpts(overrideArguments=None):
         '--metadata-from-title',
         metavar='FORMAT', dest='metafromtitle',
         help='Parse additional metadata like song title / artist from the video title. '
-             'The format syntax is the same as --output, '
-             'the parsed parameters replace existing values. '
-             'Additional templates: %(album)s, %(artist)s. '
+             'The format syntax is the same as --output. Regular expression with '
+             'named capture groups may also be used. '
+             'The parsed parameters replace existing values. '
             'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
-             '"Coldplay - Paradise"')
+             '"Coldplay - Paradise". '
+             'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"')
     postproc.add_option(
         '--xattrs',
         action='store_true', dest='xattrs', default=False,
diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py
index 90630c2d7..64dabe790 100644
--- a/youtube_dl/postprocessor/execafterdownload.py
+++ b/youtube_dl/postprocessor/execafterdownload.py
@@ -4,7 +4,10 @@ import subprocess

 from .common import PostProcessor
 from ..compat import compat_shlex_quote
-from ..utils import PostProcessingError
+from ..utils import (
+    encodeArgument,
+    PostProcessingError,
+)


 class ExecAfterDownloadPP(PostProcessor):
@@ -20,7 +23,7 @@ class ExecAfterDownloadPP(PostProcessor):
         cmd = cmd.replace('{}', compat_shlex_quote(information['filepath']))

         self._downloader.to_screen('[exec] Executing command: %s' % cmd)
-        retCode = subprocess.call(cmd, shell=True)
+        retCode = subprocess.call(encodeArgument(cmd), shell=True)
         if retCode != 0:
             raise PostProcessingError(
                 'Command returned error code %d' % retCode)
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index c91ec8588..51256a3fb 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -444,7 +444,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):

         chapters = info.get('chapters', [])
         if chapters:
-            metadata_filename = encodeFilename(replace_extension(filename, 'meta'))
+            metadata_filename = replace_extension(filename, 'meta')
             with io.open(metadata_filename, 'wt', encoding='utf-8') as f:
                 def ffmpeg_escape(text):
                     return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text)
@@ -542,7 +542,7 @@ class FFmpegFixupM3u8PP(FFmpegPostProcessor):
             temp_filename = prepend_extension(filename, 'temp')
             options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']

-            self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename)
+            self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename)
             self.run_ffmpeg(filename, temp_filename, options)

             os.remove(encodeFilename(filename))
diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py
index a7d637a3c..f5c14d974 100644
--- a/youtube_dl/postprocessor/metadatafromtitle.py
+++ b/youtube_dl/postprocessor/metadatafromtitle.py
@@ -9,7 +9,9 @@ class MetadataFromTitlePP(PostProcessor):
     def __init__(self, downloader, titleformat):
         super(MetadataFromTitlePP, self).__init__(downloader)
         self._titleformat = titleformat
-        self._titleregex = self.format_to_regex(titleformat)
+        self._titleregex = (self.format_to_regex(titleformat)
+                            if re.search(r'%\(\w+\)s', titleformat)
+                            else titleformat)

     def format_to_regex(self, fmt):
         r"""
@@ -33,11 +35,14 @@ class MetadataFromTitlePP(PostProcessor):
         title = info['title']
         match = re.match(self._titleregex, title)
         if match is None:
-            self._downloader.to_screen('[fromtitle] Could not interpret title of video as "%s"' % self._titleformat)
"%s"' % self._titleformat) +            self._downloader.to_screen( +                '[fromtitle] Could not interpret title of video as "%s"' +                % self._titleformat)              return [], info          for attribute, value in match.groupdict().items(): -            value = match.group(attribute)              info[attribute] = value -            self._downloader.to_screen('[fromtitle] parsed ' + attribute + ': ' + value) +            self._downloader.to_screen( +                '[fromtitle] parsed %s: %s' +                % (attribute, value if value is not None else 'NA'))          return [], info diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4d0685d83..9e4492d40 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -22,7 +22,6 @@ import locale  import math  import operator  import os -import pipes  import platform  import random  import re @@ -36,6 +35,7 @@ import xml.etree.ElementTree  import zlib  from .compat import ( +    compat_HTMLParseError,      compat_HTMLParser,      compat_basestring,      compat_chr, @@ -365,9 +365,9 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True):      retlist = []      for m in re.finditer(r'''(?xs)          <([a-zA-Z0-9:._-]+) -         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? +         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?           \s+%s=['"]?%s['"]? -         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? +         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?          \s*>          (?P<content>.*?)          </\1> @@ -409,8 +409,12 @@ def extract_attributes(html_element):      but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.      """      parser = HTMLAttributeParser() -    parser.feed(html_element) -    parser.close() +    try: +        parser.feed(html_element) +        parser.close() +    # Older Python may throw HTMLParseError in case of malformed HTML +    except compat_HTMLParseError: +        pass      return parser.attrs @@ -592,7 +596,7 @@ def unescapeHTML(s):      assert type(s) == compat_str      return re.sub( -        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) +        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)  def get_subprocess_encoding(): @@ -932,14 +936,6 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):          except zlib.error:              return zlib.decompress(data) -    @staticmethod -    def addinfourl_wrapper(stream, headers, url, code): -        if hasattr(compat_urllib_request.addinfourl, 'getcode'): -            return compat_urllib_request.addinfourl(stream, headers, url, code) -        ret = compat_urllib_request.addinfourl(stream, headers, url) -        ret.code = code -        return ret -      def http_request(self, req):          # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not          # always respected by websites, some tend to give out URLs with non percent-encoded @@ -991,13 +987,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):                      break                  else:                      raise original_ioerror -            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) +            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)              resp.msg = old_resp.msg              del resp.headers['Content-encoding']          # 
         if resp.headers.get('Content-encoding', '') == 'deflate':
             gz = io.BytesIO(self.deflate(resp.read()))
-            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
+            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
             del resp.headers['Content-encoding']
         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
@@ -1187,7 +1183,7 @@ def unified_timestamp(date_str, day_first=True):
     if date_str is None:
         return None

-    date_str = date_str.replace(',', ' ')
+    date_str = re.sub(r'[,|]', '', date_str)

     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
     timezone, date_str = extract_timezone(date_str)
@@ -1538,7 +1534,7 @@ def shell_quote(args):
         if isinstance(a, bytes):
             # We may get a filename encoded with 'encodeFilename'
             a = a.decode(encoding)
-        quoted_args.append(pipes.quote(a))
+        quoted_args.append(compat_shlex_quote(a))
     return ' '.join(quoted_args)
@@ -1819,6 +1815,10 @@ def float_or_none(v, scale=1, invscale=1, default=None):
         return default


+def bool_or_none(v, default=None):
+    return v if isinstance(v, bool) else default
+
+
 def strip_or_none(v):
     return None if v is None else v.strip()
@@ -2098,7 +2098,7 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
     return new_req


-def try_multipart_encode(data, boundary):
+def _multipart_encode_impl(data, boundary):
     content_type = 'multipart/form-data; boundary=%s' % boundary

     out = b''
@@ -2110,7 +2110,7 @@
             v = v.encode('utf-8')
         # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
         # suggests sending UTF-8 directly. Firefox sends UTF-8, too
-        content = b'Content-Disposition: form-data; name="%s"\r\n\r\n' % k + v + b'\r\n'
+        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
         if boundary.encode('ascii') in content:
             raise ValueError('Boundary overlaps with data')
         out += content
@@ -2140,7 +2140,7 @@ def multipart_encode(data, boundary=None):
             boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

         try:
-            out, content_type = try_multipart_encode(data, boundary)
+            out, content_type = _multipart_encode_impl(data, boundary)
             break
         except ValueError:
             if has_specified_boundary:
@@ -2211,7 +2211,12 @@ def parse_age_limit(s):


 def strip_jsonp(code):
     return re.sub(
-        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
+        r'''(?sx)^
+            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
+            (?:\s*&&\s*(?P=func_name))?
+            \s*\(\s*(?P<callback_data>.*)\);?
+            \s*?(?://[^\n]*)*$''',
+        r'\g<callback_data>', code)


 def js_to_json(code):
@@ -2360,11 +2365,11 @@ def parse_codecs(codecs_str):
         if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
             if not vcodec:
                 vcodec = full_codec
-        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
+        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
             if not acodec:
                 acodec = full_codec
         else:
-            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
+            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
     if not vcodec and not acodec:
         if len(splited_codecs) == 2:
             return {
@@ -2732,6 +2737,8 @@ def cli_option(params, command_option, param):

 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
     param = params.get(param)
+    if param is None:
+        return []
     assert isinstance(param, bool)
     if separator:
         return [command_option + separator + (true_value if param else false_value)]
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index c19ac49b0..8399c04fe 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2017.05.01'
+__version__ = '2017.09.15'
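A couple of the changes above can be sanity-checked in isolation. The jsinterp.py hunks make dot and bracket member access share one code path, with remove_quotes() normalising quoted keys, so obfuscated player code such as a["reverse"]() now resolves exactly like a.reverse(). A minimal sketch against a checkout at this revision (the sample signature-style function is invented for illustration):

    from youtube_dl.jsinterp import JSInterpreter

    # Bracket notation with a quoted key: the new member2 branch plus
    # remove_quotes() routes this through the same handler as a.reverse().
    jsi = JSInterpreter('function f(a){a=a.split("");a["reverse"]();return a.join("")}')
    assert jsi.call_function('f', 'abc') == 'cba'

Before this change the bracket form fell through to the 'Unsupported JS expression' error, since the old member regex accepted dot notation only.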

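Likewise, the widened strip_jsonp() pattern in utils.py accepts the guarded window-style wrappers some sites emit around JSON payloads. A small sketch (the callback name cb is made up for the example):

    from youtube_dl.utils import strip_jsonp

    # Plain wrapper, handled both before and after the change.
    assert strip_jsonp('cb({"status": "ok"});') == '{"status": "ok"}'
    # window.cb && cb(...) guard, newly matched via the optional
    # (?:window\.)? prefix and the (?P=func_name) backreference.
    assert strip_jsonp('window.cb && cb({"status": "ok"});') == '{"status": "ok"}'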