diff options
Diffstat (limited to 'youtube_dl')
24 files changed, 756 insertions, 291 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 702a6ad50..cad6b026e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -21,24 +21,24 @@ import subprocess  import socket  import sys  import time +import tokenize  import traceback  if os.name == 'nt':      import ctypes  from .compat import ( -    compat_basestring,      compat_cookiejar,      compat_expanduser,      compat_get_terminal_size,      compat_http_client,      compat_kwargs,      compat_str, +    compat_tokenize_tokenize,      compat_urllib_error,      compat_urllib_request,  )  from .utils import ( -    escape_url,      ContentTooShortError,      date_from_str,      DateRange, @@ -49,7 +49,6 @@ from .utils import (      ExtractorError,      format_bytes,      formatSeconds, -    HEADRequest,      locked_file,      make_HTTPS_handler,      MaxDownloadsReached, @@ -853,8 +852,8 @@ class YoutubeDL(object):          else:              raise Exception('Invalid result type: %s' % result_type) -    def _apply_format_filter(self, format_spec, available_formats): -        " Returns a tuple of the remaining format_spec and filtered formats " +    def _build_format_filter(self, filter_spec): +        " Returns a function to filter the formats according to the filter_spec "          OPERATORS = {              '<': operator.lt, @@ -864,13 +863,13 @@ class YoutubeDL(object):              '=': operator.eq,              '!=': operator.ne,          } -        operator_rex = re.compile(r'''(?x)\s*\[ +        operator_rex = re.compile(r'''(?x)\s*              (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)              \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*              (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) -            \]$ +            $              ''' % '|'.join(map(re.escape, OPERATORS.keys()))) -        m = operator_rex.search(format_spec) +        m = operator_rex.search(filter_spec)          if m:              try:                  comparison_value = int(m.group('value')) @@ -881,7 +880,7 @@ class YoutubeDL(object):                  if comparison_value is None:                      raise ValueError(                          'Invalid value %r in format specification %r' % ( -                            m.group('value'), format_spec)) +                            m.group('value'), filter_spec))              op = OPERATORS[m.group('op')]          if not m: @@ -889,85 +888,283 @@ class YoutubeDL(object):                  '=': operator.eq,                  '!=': operator.ne,              } -            str_operator_rex = re.compile(r'''(?x)\s*\[ +            str_operator_rex = re.compile(r'''(?x)                  \s*(?P<key>ext|acodec|vcodec|container|protocol)                  \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?                  \s*(?P<value>[a-zA-Z0-9_-]+) -                \s*\]$ +                \s*$                  ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) -            m = str_operator_rex.search(format_spec) +            m = str_operator_rex.search(filter_spec)              if m:                  comparison_value = m.group('value')                  op = STR_OPERATORS[m.group('op')]          if not m: -            raise ValueError('Invalid format specification %r' % format_spec) +            raise ValueError('Invalid filter specification %r' % filter_spec)          def _filter(f):              actual_value = f.get(m.group('key'))              if actual_value is None:                  return m.group('none_inclusive')              return op(actual_value, comparison_value) -        new_formats = [f for f in available_formats if _filter(f)] +        return _filter + +    def build_format_selector(self, format_spec): +        def syntax_error(note, start): +            message = ( +                'Invalid format specification: ' +                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) +            return SyntaxError(message) + +        PICKFIRST = 'PICKFIRST' +        MERGE = 'MERGE' +        SINGLE = 'SINGLE' +        GROUP = 'GROUP' +        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) + +        def _parse_filter(tokens): +            filter_parts = [] +            for type, string, start, _, _ in tokens: +                if type == tokenize.OP and string == ']': +                    return ''.join(filter_parts) +                else: +                    filter_parts.append(string) + +        def _remove_unused_ops(tokens): +            # Remove operators that we don't use and join them with the sourrounding strings +            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' +            ALLOWED_OPS = ('/', '+', ',', '(', ')') +            last_string, last_start, last_end, last_line = None, None, None, None +            for type, string, start, end, line in tokens: +                if type == tokenize.OP and string == '[': +                    if last_string: +                        yield tokenize.NAME, last_string, last_start, last_end, last_line +                        last_string = None +                    yield type, string, start, end, line +                    # everything inside brackets will be handled by _parse_filter +                    for type, string, start, end, line in tokens: +                        yield type, string, start, end, line +                        if type == tokenize.OP and string == ']': +                            break +                elif type == tokenize.OP and string in ALLOWED_OPS: +                    if last_string: +                        yield tokenize.NAME, last_string, last_start, last_end, last_line +                        last_string = None +                    yield type, string, start, end, line +                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: +                    if not last_string: +                        last_string = string +                        last_start = start +                        last_end = end +                    else: +                        last_string += string +            if last_string: +                yield tokenize.NAME, last_string, last_start, last_end, last_line + +        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): +            selectors = [] +            current_selector = None +            for type, string, start, _, _ in tokens: +                # ENCODING is only defined in python 3.x +                if type == getattr(tokenize, 'ENCODING', None): +                    continue +                elif type in [tokenize.NAME, tokenize.NUMBER]: +                    current_selector = FormatSelector(SINGLE, string, []) +                elif type == tokenize.OP: +                    if string == ')': +                        if not inside_group: +                            # ')' will be handled by the parentheses group +                            tokens.restore_last_token() +                        break +                    elif inside_merge and string in ['/', ',']: +                        tokens.restore_last_token() +                        break +                    elif inside_choice and string == ',': +                        tokens.restore_last_token() +                        break +                    elif string == ',': +                        if not current_selector: +                            raise syntax_error('"," must follow a format selector', start) +                        selectors.append(current_selector) +                        current_selector = None +                    elif string == '/': +                        if not current_selector: +                            raise syntax_error('"/" must follow a format selector', start) +                        first_choice = current_selector +                        second_choice = _parse_format_selection(tokens, inside_choice=True) +                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) +                    elif string == '[': +                        if not current_selector: +                            current_selector = FormatSelector(SINGLE, 'best', []) +                        format_filter = _parse_filter(tokens) +                        current_selector.filters.append(format_filter) +                    elif string == '(': +                        if current_selector: +                            raise syntax_error('Unexpected "("', start) +                        group = _parse_format_selection(tokens, inside_group=True) +                        current_selector = FormatSelector(GROUP, group, []) +                    elif string == '+': +                        video_selector = current_selector +                        audio_selector = _parse_format_selection(tokens, inside_merge=True) +                        if not video_selector or not audio_selector: +                            raise syntax_error('"+" must be between two format selectors', start) +                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) +                    else: +                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start) +                elif type == tokenize.ENDMARKER: +                    break +            if current_selector: +                selectors.append(current_selector) +            return selectors + +        def _build_selector_function(selector): +            if isinstance(selector, list): +                fs = [_build_selector_function(s) for s in selector] + +                def selector_function(formats): +                    for f in fs: +                        for format in f(formats): +                            yield format +                return selector_function +            elif selector.type == GROUP: +                selector_function = _build_selector_function(selector.selector) +            elif selector.type == PICKFIRST: +                fs = [_build_selector_function(s) for s in selector.selector] + +                def selector_function(formats): +                    for f in fs: +                        picked_formats = list(f(formats)) +                        if picked_formats: +                            return picked_formats +                    return [] +            elif selector.type == SINGLE: +                format_spec = selector.selector + +                def selector_function(formats): +                    formats = list(formats) +                    if not formats: +                        return +                    if format_spec == 'all': +                        for f in formats: +                            yield f +                    elif format_spec in ['best', 'worst', None]: +                        format_idx = 0 if format_spec == 'worst' else -1 +                        audiovideo_formats = [ +                            f for f in formats +                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] +                        if audiovideo_formats: +                            yield audiovideo_formats[format_idx] +                        # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format +                        elif (all(f.get('acodec') != 'none' for f in formats) or +                              all(f.get('vcodec') != 'none' for f in formats)): +                            yield formats[format_idx] +                    elif format_spec == 'bestaudio': +                        audio_formats = [ +                            f for f in formats +                            if f.get('vcodec') == 'none'] +                        if audio_formats: +                            yield audio_formats[-1] +                    elif format_spec == 'worstaudio': +                        audio_formats = [ +                            f for f in formats +                            if f.get('vcodec') == 'none'] +                        if audio_formats: +                            yield audio_formats[0] +                    elif format_spec == 'bestvideo': +                        video_formats = [ +                            f for f in formats +                            if f.get('acodec') == 'none'] +                        if video_formats: +                            yield video_formats[-1] +                    elif format_spec == 'worstvideo': +                        video_formats = [ +                            f for f in formats +                            if f.get('acodec') == 'none'] +                        if video_formats: +                            yield video_formats[0] +                    else: +                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] +                        if format_spec in extensions: +                            filter_f = lambda f: f['ext'] == format_spec +                        else: +                            filter_f = lambda f: f['format_id'] == format_spec +                        matches = list(filter(filter_f, formats)) +                        if matches: +                            yield matches[-1] +            elif selector.type == MERGE: +                def _merge(formats_info): +                    format_1, format_2 = [f['format_id'] for f in formats_info] +                    # The first format must contain the video and the +                    # second the audio +                    if formats_info[0].get('vcodec') == 'none': +                        self.report_error('The first format must ' +                                          'contain the video, try using ' +                                          '"-f %s+%s"' % (format_2, format_1)) +                        return +                    output_ext = ( +                        formats_info[0]['ext'] +                        if self.params.get('merge_output_format') is None +                        else self.params['merge_output_format']) +                    return { +                        'requested_formats': formats_info, +                        'format': '%s+%s' % (formats_info[0].get('format'), +                                             formats_info[1].get('format')), +                        'format_id': '%s+%s' % (formats_info[0].get('format_id'), +                                                formats_info[1].get('format_id')), +                        'width': formats_info[0].get('width'), +                        'height': formats_info[0].get('height'), +                        'resolution': formats_info[0].get('resolution'), +                        'fps': formats_info[0].get('fps'), +                        'vcodec': formats_info[0].get('vcodec'), +                        'vbr': formats_info[0].get('vbr'), +                        'stretched_ratio': formats_info[0].get('stretched_ratio'), +                        'acodec': formats_info[1].get('acodec'), +                        'abr': formats_info[1].get('abr'), +                        'ext': output_ext, +                    } +                video_selector, audio_selector = map(_build_selector_function, selector.selector) -        new_format_spec = format_spec[:-len(m.group(0))] -        if not new_format_spec: -            new_format_spec = 'best' +                def selector_function(formats): +                    formats = list(formats) +                    for pair in itertools.product(video_selector(formats), audio_selector(formats)): +                        yield _merge(pair) -        return (new_format_spec, new_formats) +            filters = [self._build_format_filter(f) for f in selector.filters] -    def select_format(self, format_spec, available_formats): -        while format_spec.endswith(']'): -            format_spec, available_formats = self._apply_format_filter( -                format_spec, available_formats) -        if not available_formats: -            return None +            def final_selector(formats): +                for _filter in filters: +                    formats = list(filter(_filter, formats)) +                return selector_function(formats) +            return final_selector -        if format_spec in ['best', 'worst', None]: -            format_idx = 0 if format_spec == 'worst' else -1 -            audiovideo_formats = [ -                f for f in available_formats -                if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] -            if audiovideo_formats: -                return audiovideo_formats[format_idx] -            # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format -            elif (all(f.get('acodec') != 'none' for f in available_formats) or -                  all(f.get('vcodec') != 'none' for f in available_formats)): -                return available_formats[format_idx] -        elif format_spec == 'bestaudio': -            audio_formats = [ -                f for f in available_formats -                if f.get('vcodec') == 'none'] -            if audio_formats: -                return audio_formats[-1] -        elif format_spec == 'worstaudio': -            audio_formats = [ -                f for f in available_formats -                if f.get('vcodec') == 'none'] -            if audio_formats: -                return audio_formats[0] -        elif format_spec == 'bestvideo': -            video_formats = [ -                f for f in available_formats -                if f.get('acodec') == 'none'] -            if video_formats: -                return video_formats[-1] -        elif format_spec == 'worstvideo': -            video_formats = [ -                f for f in available_formats -                if f.get('acodec') == 'none'] -            if video_formats: -                return video_formats[0] -        else: -            extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] -            if format_spec in extensions: -                filter_f = lambda f: f['ext'] == format_spec -            else: -                filter_f = lambda f: f['format_id'] == format_spec -            matches = list(filter(filter_f, available_formats)) -            if matches: -                return matches[-1] -        return None +        stream = io.BytesIO(format_spec.encode('utf-8')) +        try: +            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) +        except tokenize.TokenError: +            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + +        class TokenIterator(object): +            def __init__(self, tokens): +                self.tokens = tokens +                self.counter = 0 + +            def __iter__(self): +                return self + +            def __next__(self): +                if self.counter >= len(self.tokens): +                    raise StopIteration() +                value = self.tokens[self.counter] +                self.counter += 1 +                return value + +            next = __next__ + +            def restore_last_token(self): +                self.counter -= 1 + +        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) +        return _build_selector_function(parsed_selector)      def _calc_headers(self, info_dict):          res = std_headers.copy() @@ -1111,56 +1308,8 @@ class YoutubeDL(object):                      req_format_list.append('bestvideo+bestaudio')              req_format_list.append('best')              req_format = '/'.join(req_format_list) -        formats_to_download = [] -        if req_format == 'all': -            formats_to_download = formats -        else: -            for rfstr in req_format.split(','): -                # We can accept formats requested in the format: 34/5/best, we pick -                # the first that is available, starting from left -                req_formats = rfstr.split('/') -                for rf in req_formats: -                    if re.match(r'.+?\+.+?', rf) is not None: -                        # Two formats have been requested like '137+139' -                        format_1, format_2 = rf.split('+') -                        formats_info = (self.select_format(format_1, formats), -                                        self.select_format(format_2, formats)) -                        if all(formats_info): -                            # The first format must contain the video and the -                            # second the audio -                            if formats_info[0].get('vcodec') == 'none': -                                self.report_error('The first format must ' -                                                  'contain the video, try using ' -                                                  '"-f %s+%s"' % (format_2, format_1)) -                                return -                            output_ext = ( -                                formats_info[0]['ext'] -                                if self.params.get('merge_output_format') is None -                                else self.params['merge_output_format']) -                            selected_format = { -                                'requested_formats': formats_info, -                                'format': '%s+%s' % (formats_info[0].get('format'), -                                                     formats_info[1].get('format')), -                                'format_id': '%s+%s' % (formats_info[0].get('format_id'), -                                                        formats_info[1].get('format_id')), -                                'width': formats_info[0].get('width'), -                                'height': formats_info[0].get('height'), -                                'resolution': formats_info[0].get('resolution'), -                                'fps': formats_info[0].get('fps'), -                                'vcodec': formats_info[0].get('vcodec'), -                                'vbr': formats_info[0].get('vbr'), -                                'stretched_ratio': formats_info[0].get('stretched_ratio'), -                                'acodec': formats_info[1].get('acodec'), -                                'abr': formats_info[1].get('abr'), -                                'ext': output_ext, -                            } -                        else: -                            selected_format = None -                    else: -                        selected_format = self.select_format(rf, formats) -                    if selected_format is not None: -                        formats_to_download.append(selected_format) -                        break +        format_selector = self.build_format_selector(req_format) +        formats_to_download = list(format_selector(formats))          if not formats_to_download:              raise ExtractorError('requested format not available',                                   expected=True) @@ -1708,27 +1857,6 @@ class YoutubeDL(object):      def urlopen(self, req):          """ Start an HTTP download """ - -        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not -        # always respected by websites, some tend to give out URLs with non percent-encoded -        # non-ASCII characters (see telemb.py, ard.py [#3412]) -        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) -        # To work around aforementioned issue we will replace request's original URL with -        # percent-encoded one -        req_is_string = isinstance(req, compat_basestring) -        url = req if req_is_string else req.get_full_url() -        url_escaped = escape_url(url) - -        # Substitute URL if any change after escaping -        if url != url_escaped: -            if req_is_string: -                req = url_escaped -            else: -                req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request -                req = req_type( -                    url_escaped, data=req.data, headers=req.headers, -                    origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) -          return self._opener.open(req, timeout=self._socket_timeout)      def print_debug_header(self): diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index e4b9286c0..ace5bd716 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -436,6 +436,11 @@ except TypeError:  # Python 2.6              yield n              n += step +if sys.version_info >= (3, 0): +    from tokenize import tokenize as compat_tokenize_tokenize +else: +    from tokenize import generate_tokens as compat_tokenize_tokenize +  __all__ = [      'compat_HTTPError',      'compat_basestring', @@ -457,6 +462,7 @@ __all__ = [      'compat_socket_create_connection',      'compat_str',      'compat_subprocess_get_DEVNULL', +    'compat_tokenize_tokenize',      'compat_urllib_error',      'compat_urllib_parse',      'compat_urllib_parse_unquote', diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 1d5cc9904..30699934b 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -83,6 +83,16 @@ class CurlFD(ExternalFD):          return cmd +class AxelFD(ExternalFD): +    def _make_cmd(self, tmpfilename, info_dict): +        cmd = [self.exe, '-o', tmpfilename] +        for key, val in info_dict['http_headers'].items(): +            cmd += ['-H', '%s: %s' % (key, val)] +        cmd += self._configuration_args() +        cmd += ['--', info_dict['url']] +        return cmd + +  class WgetFD(ExternalFD):      def _make_cmd(self, tmpfilename, info_dict):          cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index b7f144af9..a29f5cf31 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -4,6 +4,7 @@ import errno  import os  import socket  import time +import re  from .common import FileDownloader  from ..compat import ( @@ -57,6 +58,24 @@ class HttpFD(FileDownloader):              # Establish connection              try:                  data = self.ydl.urlopen(request) +                # When trying to resume, Content-Range HTTP header of response has to be checked +                # to match the value of requested Range HTTP header. This is due to a webservers +                # that don't support resuming and serve a whole file with no Content-Range +                # set in response despite of requested Range (see +                # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) +                if resume_len > 0: +                    content_range = data.headers.get('Content-Range') +                    if content_range: +                        content_range_m = re.search(r'bytes (\d+)-', content_range) +                        # Content-Range is present and matches requested Range, resume is possible +                        if content_range_m and resume_len == int(content_range_m.group(1)): +                            break +                    # Content-Range is either not present or invalid. Assuming remote webserver is +                    # trying to send the whole file, resume is not possible, so wiping the local file +                    # and performing entire redownload +                    self.report_unable_to_resume() +                    resume_len = 0 +                    open_mode = 'wb'                  break              except (compat_urllib_error.HTTPError, ) as err:                  if (err.code < 500 or err.code >= 600) and err.code != 416: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7e5c90829..e38e77a27 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -118,6 +118,7 @@ from .dailymotion import (  )  from .daum import DaumIE  from .dbtv import DBTVIE +from .dcn import DCNIE  from .dctp import DctpTvIE  from .deezer import DeezerPlaylistIE  from .dfb import DFBIE @@ -431,6 +432,10 @@ from .orf import (  from .parliamentliveuk import ParliamentLiveUKIE  from .patreon import PatreonIE  from .pbs import PBSIE +from .periscope import ( +    PeriscopeIE, +    QuickscopeIE, +)  from .philharmoniedeparis import PhilharmonieDeParisIE  from .phoenix import PhoenixIE  from .photobucket import PhotobucketIE diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 9a1b6e3dc..abc5a44a1 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -527,6 +527,18 @@ class BBCIE(BBCCoUkIE):              'skip_download': True,          }      }, { +        # single video from video playlist embedded with vxp-playlist-data JSON +        'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', +        'info_dict': { +            'id': 'p02w6qjc', +            'ext': 'mp4', +            'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', +            'duration': 56, +        }, +        'params': { +            'skip_download': True, +        } +    }, {          # single video story with digitalData          'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',          'info_dict': { @@ -695,13 +707,36 @@ class BBCIE(BBCCoUkIE):          if not medias:              # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international) -            media_asset_page = self._parse_json( +            media_asset = self._search_regex( +                r'mediaAssetPage\.init\(\s*({.+?}), "/', +                webpage, 'media asset', default=None) +            if media_asset: +                media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) +                medias = [] +                for video in media_asset_page.get('videos', {}).values(): +                    medias.extend(video.values()) + +        if not medias: +            # Multiple video playlist with single `now playing` entry (e.g. +            # http://www.bbc.com/news/video_and_audio/must_see/33767813) +            vxp_playlist = self._parse_json(                  self._search_regex( -                    r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'), +                    r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>', +                    webpage, 'playlist data'),                  playlist_id) -            medias = [] -            for video in media_asset_page.get('videos', {}).values(): -                medias.extend(video.values()) +            playlist_medias = [] +            for item in vxp_playlist: +                media = item.get('media') +                if not media: +                    continue +                playlist_medias.append(media) +                # Download single video if found media with asset id matching the video id from URL +                if item.get('advert', {}).get('assetId') == playlist_id: +                    medias = [media] +                    break +            # Fallback to the whole playlist +            if not medias: +                medias = playlist_medias          entries = []          for num, media_meta in enumerate(medias, start=1): diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index a5c3cb7c6..7af903571 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,53 +1,68 @@  from __future__ import unicode_literals  import re -import time -import xml.etree.ElementTree  from .common import InfoExtractor  from ..utils import ( -    ExtractorError, -    parse_duration, +    determine_ext, +    int_or_none, +    js_to_json, +    parse_iso8601, +    remove_end,  )  class ClipfishIE(InfoExtractor): -    IE_NAME = 'clipfish' - -    _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/' +    _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'      _TEST = {          'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', -        'md5': '2521cd644e862936cf2e698206e47385', +        'md5': '79bc922f3e8a9097b3d68a93780fd475',          'info_dict': {              'id': '3966754',              'ext': 'mp4',              'title': 'FIFA 14 - E3 2013 Trailer', +            'timestamp': 1370938118, +            'upload_date': '20130611',              'duration': 82, -        }, -        'skip': 'Blocked in the US' +        }      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group(1) - -        info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % -                    (video_id, int(time.time()))) -        doc = self._download_xml( -            info_url, video_id, note='Downloading info page') -        title = doc.find('title').text -        video_url = doc.find('filename').text -        if video_url is None: -            xml_bytes = xml.etree.ElementTree.tostring(doc) -            raise ExtractorError('Cannot find video URL in document %r' % -                                 xml_bytes) -        thumbnail = doc.find('imageurl').text -        duration = parse_duration(doc.find('duration').text) +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        video_info = self._parse_json( +            js_to_json(self._html_search_regex( +                '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')), +            video_id) + +        formats = [] +        for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage): +            ext = determine_ext(video_url) +            if ext == 'm3u8': +                formats.append({ +                    'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), +                    'ext': 'mp4', +                    'format_id': 'hls', +                }) +            else: +                formats.append({ +                    'url': video_url, +                    'format_id': ext, +                }) +        self._sort_formats(formats) + +        title = remove_end(self._og_search_title(webpage), ' - Video') +        thumbnail = self._og_search_thumbnail(webpage) +        duration = int_or_none(video_info.get('length')) +        timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date'))          return {              'id': video_id,              'title': title, -            'url': video_url, +            'formats': formats,              'thumbnail': thumbnail,              'duration': duration, +            'timestamp': timestamp,          } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 717dcec7b..def6caa0d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -638,7 +638,7 @@ class InfoExtractor(object):      @staticmethod      def _meta_regex(prop):          return r'''(?isx)<meta -                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) +                    (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1)                      [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)      def _og_search_property(self, prop, html, name=None, **kargs): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 85d945509..2d90b2224 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -15,7 +15,6 @@ from ..utils import (      ExtractorError,      determine_ext,      int_or_none, -    orderedSet,      parse_iso8601,      str_to_int,      unescapeHTML, @@ -278,7 +277,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):      }]      def _extract_entries(self, id): -        video_ids = [] +        video_ids = set()          processed_urls = set()          for pagenum in itertools.count(1):              page_url = self._PAGE_TEMPLATE % (id, pagenum) @@ -291,12 +290,13 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):              processed_urls.add(urlh.geturl()) -            video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage)) +            for video_id in re.findall(r'data-xid="(.+?)"', webpage): +                if video_id not in video_ids: +                    yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') +                    video_ids.add(video_id)              if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:                  break -        return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') -                for video_id in orderedSet(video_ids)]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py new file mode 100644 index 000000000..82261e25c --- /dev/null +++ b/youtube_dl/extractor/dcn.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( +    compat_urllib_parse, +    compat_urllib_request, +) +from ..utils import ( +    int_or_none, +    parse_iso8601, +) + + +class DCNIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)' +    _TEST = { +        'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', +        'info_dict': +        { +            'id': '17375', +            'ext': 'mp4', +            'title': 'رحلة العمر : الحلقة 1', +            'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 2041, +            'timestamp': 1227504126, +            'upload_date': '20081124', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        request = compat_urllib_request.Request( +            'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, +            headers={'Origin': 'http://www.dcndigital.ae'}) + +        video = self._download_json(request, video_id) +        title = video.get('title_en') or video['title_ar'] + +        webpage = self._download_webpage( +            'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' +            + compat_urllib_parse.urlencode({ +                'id': video['id'], +                'user_id': video['user_id'], +                'signature': video['signature'], +                'countries': 'Q0M=', +                'filter': 'DENY', +            }), video_id) + +        m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url') +        formats = self._extract_m3u8_formats( +            m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + +        rtsp_url = self._search_regex( +            r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) +        if rtsp_url: +            formats.append({ +                'url': rtsp_url, +                'format_id': 'rtsp', +            }) + +        self._sort_formats(formats) + +        img = video.get('img') +        thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None +        duration = int_or_none(video.get('duration')) +        description = video.get('description_en') or video.get('description_ar') +        timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'timestamp': timestamp, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index e17bb9aea..178a7ca4c 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -17,6 +17,8 @@ from ..utils import (      int_or_none,      limit_length,      urlencode_postdata, +    get_element_by_id, +    clean_html,  ) @@ -42,6 +44,7 @@ class FacebookIE(InfoExtractor):              'id': '637842556329505',              'ext': 'mp4',              'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', +            'uploader': 'Tennis on Facebook',          }      }, {          'note': 'Video without discernible title', @@ -50,6 +53,7 @@ class FacebookIE(InfoExtractor):              'id': '274175099429670',              'ext': 'mp4',              'title': 'Facebook video #274175099429670', +            'uploader': 'Asif Nawab Butt',          },          'expected_warnings': [              'title' @@ -161,6 +165,7 @@ class FacebookIE(InfoExtractor):              video_title = limit_length(video_title, 80)          if not video_title:              video_title = 'Facebook video #%s' % video_id +        uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))          return {              'id': video_id, @@ -168,4 +173,5 @@ class FacebookIE(InfoExtractor):              'formats': formats,              'duration': int_or_none(video_data.get('video_duration')),              'thumbnail': video_data.get('thumbnail_src'), +            'uploader': uploader,          } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 27584c44c..901f77304 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -304,6 +304,19 @@ class GenericIE(InfoExtractor):              },              'add_ie': ['Ooyala'],          }, +        { +            # ooyala video embedded with http://player.ooyala.com/iframe.js +            'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/', +            'info_dict': { +                'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB', +                'ext': 'mp4', +                'title': '"Steve Jobs: Man in the Machine" trailer', +                'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', +            }, +            'params': { +                'skip_download': True, +            }, +        },          # multiple ooyala embeds on SBN network websites          {              'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', @@ -1390,7 +1403,7 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'))          # Look for Ooyala videos -        mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or +        mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or                  re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or                  re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or                  re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) @@ -1725,7 +1738,7 @@ class GenericIE(InfoExtractor):          if not found:              # Broaden the findall a little bit: JWPlayer JS loader              found = filter_video(re.findall( -                r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) +                r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))          if not found:              # Flow player              found = filter_video(re.findall(r'''(?xs) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index a00f6e5e5..deead220a 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -17,7 +17,6 @@ from ..utils import (  class LyndaBaseIE(InfoExtractor):      _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' -    _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'      _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'      _NETRC_MACHINE = 'lynda' @@ -41,7 +40,7 @@ class LyndaBaseIE(InfoExtractor):              request, None, 'Logging in as %s' % username)          # Not (yet) logged in -        m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page) +        m = re.search(r'loginResultJson\s*=\s*\'(?P<json>[^\']+)\';', login_page)          if m is not None:              response = m.group('json')              response_json = json.loads(response) @@ -70,7 +69,7 @@ class LyndaBaseIE(InfoExtractor):                      request, None,                      'Confirming log in and log out from another device') -        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: +        if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):              raise ExtractorError('Unable to log in') diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 5fdd19027..fc7499958 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -29,7 +29,7 @@ class MDRIE(InfoExtractor):          doc = self._download_xml(domain + xmlurl, video_id)          formats = []          for a in doc.findall('./assets/asset'): -            url_el = a.find('.//progressiveDownloadUrl') +            url_el = a.find('./progressiveDownloadUrl')              if url_el is None:                  continue              abr = int(a.find('bitrateAudio').text) // 1000 diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 0b5ff4760..66c627bec 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -1,12 +1,11 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..compat import compat_str  from ..utils import (      ExtractorError, +    determine_ext,      int_or_none,      parse_iso8601,      parse_duration, @@ -15,7 +14,7 @@ from ..utils import (  class NowTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player' +    _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)'      _TESTS = [{          # rtl @@ -23,7 +22,7 @@ class NowTVIE(InfoExtractor):          'info_dict': {              'id': '203519',              'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', -            'ext': 'mp4', +            'ext': 'flv',              'title': 'Die neuen Bauern und eine Hochzeit',              'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',              'thumbnail': 're:^https?://.*\.jpg$', @@ -32,7 +31,7 @@ class NowTVIE(InfoExtractor):              'duration': 2786,          },          'params': { -            # m3u8 download +            # rtmp download              'skip_download': True,          },      }, { @@ -41,7 +40,7 @@ class NowTVIE(InfoExtractor):          'info_dict': {              'id': '203481',              'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', -            'ext': 'mp4', +            'ext': 'flv',              'title': 'Berlin - Tag & Nacht (Folge 934)',              'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',              'thumbnail': 're:^https?://.*\.jpg$', @@ -50,7 +49,7 @@ class NowTVIE(InfoExtractor):              'duration': 2641,          },          'params': { -            # m3u8 download +            # rtmp download              'skip_download': True,          },      }, { @@ -59,7 +58,7 @@ class NowTVIE(InfoExtractor):          'info_dict': {              'id': '165780',              'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', -            'ext': 'mp4', +            'ext': 'flv',              'title': 'Hals- und Beinbruch',              'description': 'md5:b50d248efffe244e6f56737f0911ca57',              'thumbnail': 're:^https?://.*\.jpg$', @@ -68,7 +67,7 @@ class NowTVIE(InfoExtractor):              'duration': 2742,          },          'params': { -            # m3u8 download +            # rtmp download              'skip_download': True,          },      }, { @@ -77,7 +76,7 @@ class NowTVIE(InfoExtractor):          'info_dict': {              'id': '99205',              'display_id': 'medicopter-117/angst', -            'ext': 'mp4', +            'ext': 'flv',              'title': 'Angst!',              'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',              'thumbnail': 're:^https?://.*\.jpg$', @@ -86,7 +85,7 @@ class NowTVIE(InfoExtractor):              'duration': 3025,          },          'params': { -            # m3u8 download +            # rtmp download              'skip_download': True,          },      }, { @@ -95,7 +94,7 @@ class NowTVIE(InfoExtractor):          'info_dict': {              'id': '203521',              'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', -            'ext': 'mp4', +            'ext': 'flv',              'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',              'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',              'thumbnail': 're:^https?://.*\.jpg$', @@ -104,7 +103,7 @@ class NowTVIE(InfoExtractor):              'duration': 1083,          },          'params': { -            # m3u8 download +            # rtmp download              'skip_download': True,          },      }, { @@ -113,7 +112,7 @@ class NowTVIE(InfoExtractor):          'info_dict': {              'id': '128953',              'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', -            'ext': 'mp4', +            'ext': 'flv',              'title': "Büro-Fall / Chihuahua 'Joel'",              'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',              'thumbnail': 're:^https?://.*\.jpg$', @@ -122,15 +121,19 @@ class NowTVIE(InfoExtractor):              'duration': 3092,          },          'params': { -            # m3u8 download +            # rtmp download              'skip_download': True,          }, +    }, { +        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview', +        'only_matching': True, +    }, { +        'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', +        'only_matching': True,      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        display_id = mobj.group('id') -        station = mobj.group('station') +        display_id = self._match_id(url)          info = self._download_json(              'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id, @@ -148,29 +151,19 @@ class NowTVIE(InfoExtractor):                  raise ExtractorError(                      'Video %s is not available for free' % video_id, expected=True) -        f = info.get('format', {}) -        station = f.get('station') or station - -        STATIONS = { -            'rtl': 'rtlnow', -            'rtl2': 'rtl2now', -            'vox': 'voxnow', -            'nitro': 'rtlnitronow', -            'ntv': 'n-tvnow', -            'superrtl': 'superrtlnow' -        } -          formats = []          for item in files['items']: -            item_path = remove_start(item['path'], '/') -            tbr = int_or_none(item['bitrate']) -            m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path) -            m3u8_url = m3u8_url.replace('now/', 'now/videos/') +            if determine_ext(item['path']) != 'f4v': +                continue +            app, play_path = remove_start(item['path'], '/').split('/', 1)              formats.append({ -                'url': m3u8_url, -                'format_id': '%s-%sk' % (item['id'], tbr), -                'ext': 'mp4', -                'tbr': tbr, +                'url': 'rtmpe://fms.rtl.de', +                'app': app, +                'play_path': 'mp4:%s' % play_path, +                'ext': 'flv', +                'page_url': url, +                'player_url': 'http://rtl-now.rtl.de/includes/nc_player.swf', +                'tbr': int_or_none(item.get('bitrate')),              })          self._sort_formats(formats) @@ -178,6 +171,8 @@ class NowTVIE(InfoExtractor):          description = info.get('articleLong') or info.get('articleShort')          timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')          duration = parse_duration(info.get('duration')) + +        f = info.get('format', {})          thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')          return { diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py new file mode 100644 index 000000000..8ad936758 --- /dev/null +++ b/youtube_dl/extractor/periscope.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( +    compat_urllib_parse, +    compat_urllib_request, +) +from ..utils import parse_iso8601 + + +class PeriscopeIE(InfoExtractor): +    IE_DESC = 'Periscope' +    _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' +    _TEST = { +        'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', +        'md5': '65b57957972e503fcbbaeed8f4fa04ca', +        'info_dict': { +            'id': '56102209', +            'ext': 'mp4', +            'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗', +            'timestamp': 1438978559, +            'upload_date': '20150807', +            'uploader': 'Bec Boop', +            'uploader_id': '1465763', +        }, +        'skip': 'Expires in 24 hours', +    } + +    def _call_api(self, method, token): +        return self._download_json( +            'https://api.periscope.tv/api/v2/%s?token=%s' % (method, token), token) + +    def _real_extract(self, url): +        token = self._match_id(url) + +        broadcast_data = self._call_api('getBroadcastPublic', token) +        broadcast = broadcast_data['broadcast'] +        status = broadcast['status'] + +        uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name') +        uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id') + +        title = '%s - %s' % (uploader, status) if uploader else status +        state = broadcast.get('state').lower() +        if state == 'running': +            title = self._live_title(title) +        timestamp = parse_iso8601(broadcast.get('created_at')) + +        thumbnails = [{ +            'url': broadcast[image], +        } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + +        stream = self._call_api('getAccessPublic', token) + +        formats = [] +        for format_id in ('replay', 'rtmp', 'hls', 'https_hls'): +            video_url = stream.get(format_id + '_url') +            if not video_url: +                continue +            f = { +                'url': video_url, +                'ext': 'flv' if format_id == 'rtmp' else 'mp4', +            } +            if format_id != 'rtmp': +                f['protocol'] = 'm3u8_native' if state == 'ended' else 'm3u8' +            formats.append(f) +        self._sort_formats(formats) + +        return { +            'id': broadcast.get('id') or token, +            'title': title, +            'timestamp': timestamp, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'thumbnails': thumbnails, +            'formats': formats, +        } + + +class QuickscopeIE(InfoExtractor): +    IE_DESC = 'Quick Scope' +    _VALID_URL = r'https?://watchonperiscope\.com/broadcast/(?P<id>\d+)' +    _TEST = { +        'url': 'https://watchonperiscope.com/broadcast/56180087', +        'only_matching': True, +    } + +    def _real_extract(self, url): +        broadcast_id = self._match_id(url) +        request = compat_urllib_request.Request( +            'https://watchonperiscope.com/api/accessChannel', compat_urllib_parse.urlencode({ +                'broadcast_id': broadcast_id, +                'entry_ticket': '', +                'from_push': 'false', +                'uses_sessions': 'true', +            }).encode('utf-8')) +        return self.url_result( +            self._download_json(request, broadcast_id)['share_url'], 'Periscope') diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 0b7886840..7b0cdc41a 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -81,7 +81,7 @@ class PornHubIE(InfoExtractor):          comment_count = self._extract_count(              r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') -        video_urls = list(map(compat_urllib_parse_unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) +        video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage)))          if webpage.find('"encrypted":true') != -1:              password = compat_urllib_parse_unquote_plus(                  self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) @@ -94,7 +94,7 @@ class PornHubIE(InfoExtractor):              format = path.split('/')[5].split('_')[:2]              format = "-".join(format) -            m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format) +            m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)              if m is None:                  height = None                  tbr = None diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index d1ab66b32..3bc84989e 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -1,12 +1,11 @@  # encoding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import (      int_or_none,      unified_strdate, +    js_to_json,  ) @@ -22,59 +21,48 @@ class ScreenwaveMediaIE(InfoExtractor):          video_id = self._match_id(url)          playerdata = self._download_webpage( -            'http://player.screenwavemedia.com/play/player.php?id=%s' % video_id, +            'http://player.screenwavemedia.com/player.php?id=%s' % video_id,              video_id, 'Downloading player webpage')          vidtitle = self._search_regex(              r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/') -        vidurl = self._search_regex( -            r'\'vidurl\'\s*:\s*"([^"]+)"', playerdata, 'vidurl').replace('\\/', '/') - -        videolist_url = None - -        mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata) -        if mobj: -            videoserver = mobj.group('videoserver') -            mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata) -            vidid = mobj.group('vidid') if mobj else video_id -            videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) -        else: -            mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata) -            if mobj: -                videolist_url = mobj.group('smil') - -        if videolist_url: -            videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') -            formats = [] -            baseurl = vidurl[:vidurl.rfind('/') + 1] -            for video in videolist.findall('.//video'): -                src = video.get('src') -                if not src: -                    continue -                file_ = src.partition(':')[-1] -                width = int_or_none(video.get('width')) -                height = int_or_none(video.get('height')) -                bitrate = int_or_none(video.get('system-bitrate'), scale=1000) -                format = { -                    'url': baseurl + file_, -                    'format_id': src.rpartition('.')[0].rpartition('_')[-1], -                } -                if width or height: -                    format.update({ -                        'tbr': bitrate, -                        'width': width, -                        'height': height, -                    }) -                else: -                    format.update({ -                        'abr': bitrate, -                        'vcodec': 'none', -                    }) -                formats.append(format) -        else: -            formats = [{ -                'url': vidurl, -            }] + +        playerconfig = self._download_webpage( +            'http://player.screenwavemedia.com/player.js', +            video_id, 'Downloading playerconfig webpage') + +        videoserver = self._search_regex(r"\[ipaddress\]\s*=>\s*([\d\.]+)", playerdata, 'videoserver') + +        sources = self._parse_json( +            js_to_json( +                self._search_regex( +                    r"sources\s*:\s*(\[[^\]]+?\])", playerconfig, +                    'sources', +                ).replace( +                    "' + thisObj.options.videoserver + '", +                    videoserver +                ).replace( +                    "' + playerVidId + '", +                    video_id +                ) +            ), +            video_id +        ) + +        formats = [] +        for source in sources: +            if source['type'] == 'hls': +                formats.extend(self._extract_m3u8_formats(source['file'], video_id)) +            else: +                format_label = source.get('label') +                height = int_or_none(self._search_regex( +                    r'^(\d+)[pP]', format_label, 'height', default=None)) +                formats.append({ +                    'url': source['file'], +                    'format': format_label, +                    'ext': source.get('type'), +                    'height': height, +                })          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 7fb165a87..87b650468 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -45,6 +45,14 @@ class SouthParkDeIE(SouthParkIE):              'title': 'The Government Won\'t Respect My Privacy',              'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',          }, +    }, { +        # non-ASCII characters in initial URL +        'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', +        'playlist_count': 4, +    }, { +        # non-ASCII characters in redirect URL +        'url': 'http://www.southpark.de/alle-episoden/s18e09', +        'playlist_count': 4,      }] diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index c89de5ba4..84fe71aef 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -29,6 +29,8 @@ class TudouIE(InfoExtractor):          }      }] +    _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' +      def _url_for_id(self, id, quality=None):          info_url = "http://v2.tudou.com/f?id=" + str(id)          if quality: @@ -54,6 +56,10 @@ class TudouIE(InfoExtractor):          thumbnail_url = self._search_regex(              r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False) +        player_url = self._search_regex( +            r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", +            webpage, 'player URL', default=self._PLAYER_URL) +          segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')          segments = json.loads(segs_json)          # It looks like the keys are the arguments that have to be passed as @@ -76,6 +82,9 @@ class TudouIE(InfoExtractor):                  'ext': ext,                  'title': title,                  'thumbnail': thumbnail_url, +                'http_headers': { +                    'Referer': player_url, +                },              }              result.append(part_info) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 73ce335b7..a2b6a35aa 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -7,12 +7,15 @@ import random  from .common import InfoExtractor  from ..compat import ( +    compat_parse_qs,      compat_str,      compat_urllib_parse, +    compat_urllib_parse_urlparse,      compat_urllib_request,  )  from ..utils import (      ExtractorError, +    parse_duration,      parse_iso8601,  ) @@ -185,7 +188,7 @@ class TwitchVodIE(TwitchItemBaseIE):      _ITEM_SHORTCUT = 'v'      _TEST = { -        'url': 'http://www.twitch.tv/riotgames/v/6528877', +        'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s',          'info_dict': {              'id': 'v6528877',              'ext': 'mp4', @@ -197,6 +200,7 @@ class TwitchVodIE(TwitchItemBaseIE):              'uploader': 'Riot Games',              'uploader_id': 'riotgames',              'view_count': int, +            'start_time': 310,          },          'params': {              # m3u8 download @@ -216,6 +220,12 @@ class TwitchVodIE(TwitchItemBaseIE):              item_id, 'mp4')          self._prefer_source(formats)          info['formats'] = formats + +        parsed_url = compat_urllib_parse_urlparse(url) +        query = compat_parse_qs(parsed_url.query) +        if 't' in query: +            info['start_time'] = parse_duration(query['t'][0]) +          return info diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index b4ad513a0..97315750f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,7 +4,6 @@ import re  from .common import InfoExtractor  from ..utils import ( -    ExtractorError,      unified_strdate,      str_to_int,      int_or_none, @@ -22,7 +21,7 @@ class XHamsterIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'FemaleAgent Shy beauty takes the bait',                  'upload_date': '20121014', -                'uploader_id': 'Ruseful2011', +                'uploader': 'Ruseful2011',                  'duration': 893,                  'age_limit': 18,              } @@ -34,7 +33,7 @@ class XHamsterIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Britney Spears  Sexy Booty',                  'upload_date': '20130914', -                'uploader_id': 'jojo747400', +                'uploader': 'jojo747400',                  'duration': 200,                  'age_limit': 18,              } @@ -46,12 +45,12 @@ class XHamsterIE(InfoExtractor):      ]      def _real_extract(self, url): -        def extract_video_url(webpage): -            mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage) -            if mp4 is None: -                raise ExtractorError('Unable to extract media URL') -            else: -                return mp4.group(1) +        def extract_video_url(webpage, name): +            return self._search_regex( +                [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', +                 r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', +                 r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], +                webpage, name, group='mp4')          def is_hd(webpage):              return '<div class=\'icon iconHD\'' in webpage @@ -75,10 +74,14 @@ class XHamsterIE(InfoExtractor):          if upload_date:              upload_date = unified_strdate(upload_date) -        uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)', -                                              webpage, 'uploader id', default='anonymous') +        uploader = self._html_search_regex( +            r"<a href='[^']+xhamster\.com/user/[^>]+>(?P<uploader>[^<]+)", +            webpage, 'uploader', default='anonymous') -        thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False) +        thumbnail = self._search_regex( +            [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', +             r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''], +            webpage, 'thumbnail', fatal=False, group='thumbnail')          duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',                                                            webpage, 'duration', fatal=False)) @@ -97,7 +100,9 @@ class XHamsterIE(InfoExtractor):          hd = is_hd(webpage) -        video_url = extract_video_url(webpage) +        format_id = 'hd' if hd else 'sd' + +        video_url = extract_video_url(webpage, format_id)          formats = [{              'url': video_url,              'format_id': 'hd' if hd else 'sd', @@ -108,7 +113,7 @@ class XHamsterIE(InfoExtractor):              mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')              webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')              if is_hd(webpage): -                video_url = extract_video_url(webpage) +                video_url = extract_video_url(webpage, 'hd')                  formats.append({                      'url': video_url,                      'format_id': 'hd', @@ -122,7 +127,7 @@ class XHamsterIE(InfoExtractor):              'title': title,              'description': description,              'upload_date': upload_date, -            'uploader_id': uploader_id, +            'uploader': uploader,              'thumbnail': thumbnail,              'duration': duration,              'view_count': view_count, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 78dc2b449..e265c7574 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -651,6 +651,26 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):          return ret      def http_request(self, req): +        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not +        # always respected by websites, some tend to give out URLs with non percent-encoded +        # non-ASCII characters (see telemb.py, ard.py [#3412]) +        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) +        # To work around aforementioned issue we will replace request's original URL with +        # percent-encoded one +        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) +        # the code of this workaround has been moved here from YoutubeDL.urlopen() +        url = req.get_full_url() +        url_escaped = escape_url(url) + +        # Substitute URL if any change after escaping +        if url != url_escaped: +            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request +            new_req = req_type( +                url_escaped, data=req.data, headers=req.headers, +                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) +            new_req.timeout = req.timeout +            req = new_req +          for h, v in std_headers.items():              # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275              # The dict keys are capitalized because of this bug by urllib @@ -695,6 +715,17 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):              gz = io.BytesIO(self.deflate(resp.read()))              resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)              resp.msg = old_resp.msg +        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 +        if 300 <= resp.code < 400: +            location = resp.headers.get('Location') +            if location: +                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 +                if sys.version_info >= (3, 0): +                    location = location.encode('iso-8859-1').decode('utf-8') +                location_escaped = escape_url(location) +                if location != location_escaped: +                    del resp.headers['Location'] +                    resp.headers['Location'] = location_escaped          return resp      https_request = http_request diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fa157cadb..9f209499c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.07.28' +__version__ = '2015.08.06.1'  | 
