diff options
Diffstat (limited to 'youtube_dl')
54 files changed, 1610 insertions, 1111 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 68721e9ab..ace80f14b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -975,6 +975,8 @@ class YoutubeDL(object):                      'playlist': playlist,                      'playlist_id': ie_result.get('id'),                      'playlist_title': ie_result.get('title'), +                    'playlist_uploader': ie_result.get('uploader'), +                    'playlist_uploader_id': ie_result.get('uploader_id'),                      'playlist_index': i + playliststart,                      'extractor': ie_result['extractor'],                      'webpage_url': ie_result['webpage_url'], diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 7bb61a541..ea5e3a4b5 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -112,7 +112,7 @@ class FragmentFD(FileDownloader):              if self.__do_ytdl_file(ctx):                  self._write_ytdl_file(ctx)              if not self.params.get('keep_fragments', False): -                os.remove(ctx['fragment_filename_sanitized']) +                os.remove(encodeFilename(ctx['fragment_filename_sanitized']))              del ctx['fragment_filename_sanitized']      def _prepare_frag_download(self, ctx): diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 8a6638cc2..3ff26ff70 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -284,8 +284,7 @@ class HttpFD(FileDownloader):          while count <= retries:              try:                  establish_connection() -                download() -                return True +                return download()              except RetryDownload as e:                  count += 1                  if count <= retries: diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index e6513c7a4..513dd81df 100644 --- 
a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -228,10 +228,19 @@ class AfreecaTVIE(InfoExtractor):                      r'^(\d{8})_', key, 'upload date', default=None)                  file_duration = int_or_none(file_element.get('duration'))                  format_id = key if key else '%s_%s' % (video_id, file_num) -                formats = self._extract_m3u8_formats( -                    file_url, video_id, 'mp4', entry_protocol='m3u8_native', -                    m3u8_id='hls', -                    note='Downloading part %d m3u8 information' % file_num) +                if determine_ext(file_url) == 'm3u8': +                    formats = self._extract_m3u8_formats( +                        file_url, video_id, 'mp4', entry_protocol='m3u8_native', +                        m3u8_id='hls', +                        note='Downloading part %d m3u8 information' % file_num) +                else: +                    formats = [{ +                        'url': file_url, +                        'format_id': 'http', +                    }] +                if not formats: +                    continue +                self._sort_formats(formats)                  file_info = common_entry.copy()                  file_info.update({                      'id': format_id, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 34c2b363e..e4fa72f46 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -85,8 +85,8 @@ class AnimeOnDemandIE(InfoExtractor):          if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')):              error = self._search_regex( -                r'<p class="alert alert-danger">(.+?)</p>', -                response, 'error', default=None) +                r'<p[^>]+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</p>', +                response, 'error', default=None, group='error')             
 if error:                  raise ExtractorError('Unable to login: %s' % error, expected=True)              raise ExtractorError('Unable to log in') diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 915f8862e..ef73d5a93 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -5,6 +5,7 @@ import re  from .common import InfoExtractor  from .generic import GenericIE +from ..compat import compat_str  from ..utils import (      determine_ext,      ExtractorError, @@ -126,6 +127,8 @@ class ARDMediathekIE(InfoExtractor):                  quality = stream.get('_quality')                  server = stream.get('_server')                  for stream_url in stream_urls: +                    if not isinstance(stream_url, compat_str) or '//' not in stream_url: +                        continue                      ext = determine_ext(stream_url)                      if quality != 'auto' and ext in ('f4m', 'm3u8'):                          continue @@ -146,13 +149,11 @@ class ARDMediathekIE(InfoExtractor):                                  'play_path': stream_url,                                  'format_id': 'a%s-rtmp-%s' % (num, quality),                              } -                        elif stream_url.startswith('http'): +                        else:                              f = {                                  'url': stream_url,                                  'format_id': 'a%s-%s-%s' % (num, ext, quality)                              } -                        else: -                            continue                          m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)                          if m:                              f.update({ diff --git a/youtube_dl/extractor/aws.py b/youtube_dl/extractor/aws.py new file mode 100644 index 000000000..670abce0c --- /dev/null +++ b/youtube_dl/extractor/aws.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import 
datetime +import hashlib +import hmac + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlencode + + +class AWSIE(InfoExtractor): +    _AWS_ALGORITHM = 'AWS4-HMAC-SHA256' +    _AWS_REGION = 'us-east-1' + +    def _aws_execute_api(self, aws_dict, video_id, query=None): +        query = query or {} +        amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ') +        date = amz_date[:8] +        headers = { +            'Accept': 'application/json', +            'Host': self._AWS_PROXY_HOST, +            'X-Amz-Date': amz_date, +        } +        session_token = aws_dict.get('session_token') +        if session_token: +            headers['X-Amz-Security-Token'] = session_token +        headers['X-Api-Key'] = self._AWS_API_KEY + +        def aws_hash(s): +            return hashlib.sha256(s.encode('utf-8')).hexdigest() + +        # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html +        canonical_querystring = compat_urllib_parse_urlencode(query) +        canonical_headers = '' +        for header_name, header_value in headers.items(): +            canonical_headers += '%s:%s\n' % (header_name.lower(), header_value) +        signed_headers = ';'.join([header.lower() for header in headers.keys()]) +        canonical_request = '\n'.join([ +            'GET', +            aws_dict['uri'], +            canonical_querystring, +            canonical_headers, +            signed_headers, +            aws_hash('') +        ]) + +        # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html +        credential_scope_list = [date, self._AWS_REGION, 'execute-api', 'aws4_request'] +        credential_scope = '/'.join(credential_scope_list) +        string_to_sign = '\n'.join([self._AWS_ALGORITHM, amz_date, credential_scope, aws_hash(canonical_request)]) + +        # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html +        def 
aws_hmac(key, msg): +            return hmac.new(key, msg.encode('utf-8'), hashlib.sha256) + +        def aws_hmac_digest(key, msg): +            return aws_hmac(key, msg).digest() + +        def aws_hmac_hexdigest(key, msg): +            return aws_hmac(key, msg).hexdigest() + +        k_signing = ('AWS4' + aws_dict['secret_key']).encode('utf-8') +        for value in credential_scope_list: +            k_signing = aws_hmac_digest(k_signing, value) + +        signature = aws_hmac_hexdigest(k_signing, string_to_sign) + +        # Task 4: http://docs.aws.amazon.com/general/latest/gr/sigv4-add-signature-to-request.html +        headers['Authorization'] = ', '.join([ +            '%s Credential=%s/%s' % (self._AWS_ALGORITHM, aws_dict['access_key'], credential_scope), +            'SignedHeaders=%s' % signed_headers, +            'Signature=%s' % signature, +        ]) + +        return self._download_json( +            'https://%s%s%s' % (self._AWS_PROXY_HOST, aws_dict['uri'], '?' + canonical_querystring if canonical_querystring else ''), +            video_id, headers=headers) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 5525f7c9b..8b20c03d6 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -386,7 +386,7 @@ class BBCCoUkIE(InfoExtractor):                              m3u8_id=format_id, fatal=False))                          if re.search(self._USP_RE, href):                              usp_formats = self._extract_m3u8_formats( -                                re.sub(self._USP_RE, r'/\1\.ism/\1\.m3u8', href), +                                re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),                                  programme_id, ext='mp4', entry_protocol='m3u8_native',                                  m3u8_id=format_id, fatal=False)                              for f in usp_formats: diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0ed59bcbc..f04505011 100644 --- 
a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -464,7 +464,7 @@ class BrightcoveNewIE(AdobePassIE):              'timestamp': 1441391203,              'upload_date': '20150904',              'uploader_id': '929656772001', -            'formats': 'mincount:22', +            'formats': 'mincount:20',          },      }, {          # with rtmp streams @@ -478,7 +478,7 @@ class BrightcoveNewIE(AdobePassIE):              'timestamp': 1433556729,              'upload_date': '20150606',              'uploader_id': '4036320279001', -            'formats': 'mincount:41', +            'formats': 'mincount:39',          },          'params': {              # m3u8 download @@ -564,59 +564,7 @@ class BrightcoveNewIE(AdobePassIE):          return entries -    def _real_extract(self, url): -        url, smuggled_data = unsmuggle_url(url, {}) -        self._initialize_geo_bypass(smuggled_data.get('geo_countries')) - -        account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() - -        webpage = self._download_webpage( -            'http://players.brightcove.net/%s/%s_%s/index.min.js' -            % (account_id, player_id, embed), video_id) - -        policy_key = None - -        catalog = self._search_regex( -            r'catalog\(({.+?})\);', webpage, 'catalog', default=None) -        if catalog: -            catalog = self._parse_json( -                js_to_json(catalog), video_id, fatal=False) -            if catalog: -                policy_key = catalog.get('policyKey') - -        if not policy_key: -            policy_key = self._search_regex( -                r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', -                webpage, 'policy key', group='pk') - -        api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) -        try: -            json_data = self._download_json(api_url, video_id, headers={ -                'Accept': 'application/json;pk=%s' % 
policy_key -            }) -        except ExtractorError as e: -            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: -                json_data = self._parse_json(e.cause.read().decode(), video_id)[0] -                message = json_data.get('message') or json_data['error_code'] -                if json_data.get('error_subcode') == 'CLIENT_GEO': -                    self.raise_geo_restricted(msg=message) -                raise ExtractorError(message, expected=True) -            raise - -        errors = json_data.get('errors') -        if errors and errors[0].get('error_subcode') == 'TVE_AUTH': -            custom_fields = json_data['custom_fields'] -            tve_token = self._extract_mvpd_auth( -                smuggled_data['source_url'], video_id, -                custom_fields['bcadobepassrequestorid'], -                custom_fields['bcadobepassresourceid']) -            json_data = self._download_json( -                api_url, video_id, headers={ -                    'Accept': 'application/json;pk=%s' % policy_key -                }, query={ -                    'tveToken': tve_token, -                }) - +    def _parse_brightcove_metadata(self, json_data, video_id):          title = json_data['name'].strip()          formats = [] @@ -682,6 +630,7 @@ class BrightcoveNewIE(AdobePassIE):                      })                  formats.append(f) +        errors = json_data.get('errors')          if not formats and errors:              error = errors[0]              raise ExtractorError( @@ -708,9 +657,64 @@ class BrightcoveNewIE(AdobePassIE):              'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),              'duration': duration,              'timestamp': parse_iso8601(json_data.get('published_at')), -            'uploader_id': account_id, +            'uploader_id': json_data.get('account_id'),              'formats': formats,              'subtitles': subtitles,              'tags': 
json_data.get('tags', []),              'is_live': is_live,          } + +    def _real_extract(self, url): +        url, smuggled_data = unsmuggle_url(url, {}) +        self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + +        account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() + +        webpage = self._download_webpage( +            'http://players.brightcove.net/%s/%s_%s/index.min.js' +            % (account_id, player_id, embed), video_id) + +        policy_key = None + +        catalog = self._search_regex( +            r'catalog\(({.+?})\);', webpage, 'catalog', default=None) +        if catalog: +            catalog = self._parse_json( +                js_to_json(catalog), video_id, fatal=False) +            if catalog: +                policy_key = catalog.get('policyKey') + +        if not policy_key: +            policy_key = self._search_regex( +                r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', +                webpage, 'policy key', group='pk') + +        api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) +        try: +            json_data = self._download_json(api_url, video_id, headers={ +                'Accept': 'application/json;pk=%s' % policy_key +            }) +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: +                json_data = self._parse_json(e.cause.read().decode(), video_id)[0] +                message = json_data.get('message') or json_data['error_code'] +                if json_data.get('error_subcode') == 'CLIENT_GEO': +                    self.raise_geo_restricted(msg=message) +                raise ExtractorError(message, expected=True) +            raise + +        errors = json_data.get('errors') +        if errors and errors[0].get('error_subcode') == 'TVE_AUTH': +            custom_fields = json_data['custom_fields'] +            tve_token = 
self._extract_mvpd_auth( +                smuggled_data['source_url'], video_id, +                custom_fields['bcadobepassrequestorid'], +                custom_fields['bcadobepassresourceid']) +            json_data = self._download_json( +                api_url, video_id, headers={ +                    'Accept': 'application/json;pk=%s' % policy_key +                }, query={ +                    'tveToken': tve_token, +                }) + +        return self._parse_brightcove_metadata(json_data, video_id) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 8ef089653..4bf4efe1f 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -3,20 +3,19 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import ExtractorError  class BYUtvIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?byutv\.org/watch/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' +    _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?'      
_TESTS = [{          'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',          'info_dict': { -            'id': '6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', +            'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH',              'display_id': 'studio-c-season-5-episode-5',              'ext': 'mp4',              'title': 'Season 5 Episode 5', -            'description': 'md5:e07269172baff037f8e8bf9956bc9747', -            'thumbnail': r're:^https?://.*\.jpg$', +            'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65', +            'thumbnail': r're:^https?://.*',              'duration': 1486.486,          },          'params': { @@ -26,6 +25,9 @@ class BYUtvIE(InfoExtractor):      }, {          'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d',          'only_matching': True, +    }, { +        'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -33,16 +35,16 @@ class BYUtvIE(InfoExtractor):          video_id = mobj.group('id')          display_id = mobj.group('display_id') or video_id -        webpage = self._download_webpage(url, display_id) -        episode_code = self._search_regex( -            r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information') - -        ep = self._parse_json( -            episode_code, display_id, transform_source=lambda s: -            re.sub(r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', s)) - -        if ep['providerType'] != 'Ooyala': -            raise ExtractorError('Unsupported provider %s' % ep['provider']) +        ep = self._download_json( +            'https://api.byutv.org/api3/catalog/getvideosforcontent', video_id, +            query={ +                'contentid': video_id, +                'channel': 'byutv', +                
'x-byutv-context': 'web$US', +            }, headers={ +                'x-byutv-context': 'web$US', +                'x-byutv-platformkey': 'xsaaw9c7y5', +            })['ooyalaVOD']          return {              '_type': 'url_transparent', @@ -50,44 +52,7 @@ class BYUtvIE(InfoExtractor):              'url': 'ooyala:%s' % ep['providerId'],              'id': video_id,              'display_id': display_id, -            'title': ep['title'], +            'title': ep.get('title'),              'description': ep.get('description'),              'thumbnail': ep.get('imageThumbnail'),          } - - -class BYUtvEventIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?byutv\.org/watch/event/(?P<id>[0-9a-f-]+)' -    _TEST = { -        'url': 'http://www.byutv.org/watch/event/29941b9b-8bf6-48d2-aebf-7a87add9e34b', -        'info_dict': { -            'id': '29941b9b-8bf6-48d2-aebf-7a87add9e34b', -            'ext': 'mp4', -            'title': 'Toledo vs. BYU (9/30/16)', -        }, -        'params': { -            'skip_download': True, -        }, -        'add_ie': ['Ooyala'], -    } - -    def _real_extract(self, url): -        video_id = self._match_id(url) - -        webpage = self._download_webpage(url, video_id) - -        ooyala_id = self._search_regex( -            r'providerId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', -            webpage, 'ooyala id', group='id') - -        title = self._search_regex( -            r'class=["\']description["\'][^>]*>\s*<h1>([^<]+)</h1>', webpage, -            'title').strip() - -        return { -            '_type': 'url_transparent', -            'ie_key': 'Ooyala', -            'url': 'ooyala:%s' % ooyala_id, -            'id': video_id, -            'title': title, -        } diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 7d78e3aae..90852a9ef 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -91,12 +91,10 @@ class CBSLocalIE(AnvatoIE):         
 info_dict = self._extract_anvato_videos(webpage, display_id) -        time_str = self._html_search_regex( -            r'class="entry-date">([^<]+)<', webpage, 'released date', default=None) -        if time_str: -            timestamp = unified_timestamp(time_str) -        else: -            timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage)) +        timestamp = unified_timestamp(self._html_search_regex( +            r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage, +            'released date', default=None)) or parse_iso8601( +            self._html_search_meta('uploadDate', webpage))          info_dict.update({              'display_id': display_id, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 80a9c982f..3b79b8cb4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -301,8 +301,9 @@ class InfoExtractor(object):      There must be a key "entries", which is a list, an iterable, or a PagedList      object, each element of which is a valid dictionary by this specification. -    Additionally, playlists can have "title", "description" and "id" attributes -    with the same semantics as videos (see above). +    Additionally, playlists can have "id", "title", "description", "uploader", +    "uploader_id", "uploader_url" attributes with the same semantics as videos +    (see above).      _type "multi_video" indicates that there are multiple videos that @@ -494,6 +495,16 @@ class InfoExtractor(object):                  self.to_screen('%s' % (note,))              else:                  self.to_screen('%s: %s' % (video_id, note)) + +        # Some sites check X-Forwarded-For HTTP header in order to figure out +        # the origin of the client behind proxy. This allows bypassing geo +        # restriction by faking this header's value to IP that belongs to some +        # geo unrestricted country. We will do so once we encounter any +        # geo restriction error. 
+        if self._x_forwarded_for_ip: +            if 'X-Forwarded-For' not in headers: +                headers['X-Forwarded-For'] = self._x_forwarded_for_ip +          if isinstance(url_or_request, compat_urllib_request.Request):              url_or_request = update_Request(                  url_or_request, data=data, headers=headers, query=query) @@ -523,15 +534,6 @@ class InfoExtractor(object):          if isinstance(url_or_request, (compat_str, str)):              url_or_request = url_or_request.partition('#')[0] -        # Some sites check X-Forwarded-For HTTP header in order to figure out -        # the origin of the client behind proxy. This allows bypassing geo -        # restriction by faking this header's value to IP that belongs to some -        # geo unrestricted country. We will do so once we encounter any -        # geo restriction error. -        if self._x_forwarded_for_ip: -            if 'X-Forwarded-For' not in headers: -                headers['X-Forwarded-For'] = self._x_forwarded_for_ip -          urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)          if urlh is False:              assert not fatal diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index b53f2d705..b92f25447 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -392,7 +392,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text                  'Downloading subtitles for ' + sub_name, data={                      'subtitle_script_id': sub_id,                  }) -            if not sub_doc: +            if sub_doc is None:                  continue              sid = sub_doc.get('id')              iv = xpath_text(sub_doc, 'iv', 'subtitle iv') @@ -479,9 +479,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text                      'video_quality': stream_quality,           
           'current_page': url,                  }) -            if streamdata: +            if streamdata is not None:                  stream_info = streamdata.find('./{default}preload/stream_info') -                if stream_info: +                if stream_info is not None:                      stream_infos.append(stream_info)              stream_info = self._call_rpc_api(                  'VideoEncode_GetStreamInfo', video_id, @@ -490,7 +490,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text                      'video_format': stream_format,                      'video_encode_quality': stream_quality,                  }) -            if stream_info: +            if stream_info is not None:                  stream_infos.append(stream_info)              for stream_info in stream_infos:                  video_encode_id = xpath_text(stream_info, './video_encode_id') diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 171820e27..67d6df4b0 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -4,13 +4,14 @@ import re  from .common import InfoExtractor  from ..utils import ( -    int_or_none, -    unescapeHTML, -    find_xpath_attr, -    smuggle_url,      determine_ext,      ExtractorError,      extract_attributes, +    find_xpath_attr, +    get_element_by_class, +    int_or_none, +    smuggle_url, +    unescapeHTML,  )  from .senateisvp import SenateISVPIE  from .ustream import UstreamIE @@ -68,6 +69,10 @@ class CSpanIE(InfoExtractor):              'uploader': 'HouseCommittee',              'uploader_id': '12987475',          }, +    }, { +        # Audio Only +        'url': 'https://www.c-span.org/video/?437336-1/judiciary-antitrust-competition-policy-consumer-rights', +        'only_matching': True,      }]      BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' @@ -111,7 +116,15 @@ class CSpanIE(InfoExtractor):                      
title = self._og_search_title(webpage)                      surl = smuggle_url(senate_isvp_url, {'force_title': title})                      return self.url_result(surl, 'SenateISVP', video_id, title) +                video_id = self._search_regex( +                    r'jwsetup\.clipprog\s*=\s*(\d+);', +                    webpage, 'jwsetup program id', default=None) +                if video_id: +                    video_type = 'program'          if video_type is None or video_id is None: +            error_message = get_element_by_class('VLplayer-error-message', webpage) +            if error_message: +                raise ExtractorError(error_message)              raise ExtractorError('unable to find video id and type')          def get_text_attr(d, attr): @@ -138,7 +151,7 @@ class CSpanIE(InfoExtractor):          entries = []          for partnum, f in enumerate(files):              formats = [] -            for quality in f['qualities']: +            for quality in f.get('qualities', []):                  formats.append({                      'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')),                      'url': unescapeHTML(get_text_attr(quality, 'file')), diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 21a2d0239..0e7d587dd 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -413,52 +413,3 @@ class DailymotionUserIE(DailymotionPlaylistIE):              'title': full_user,              'entries': self._extract_entries(user),          } - - -class DailymotionCloudIE(DailymotionBaseInfoExtractor): -    _VALID_URL_PREFIX = r'https?://api\.dmcloud\.net/(?:player/)?embed/' -    _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX -    _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX - -    _TESTS = [{ -        # From 
http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html -        # Tested at FranceTvInfo_2 -        'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1', -        'only_matching': True, -    }, { -        # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html -        'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1', -        'only_matching': True, -    }] - -    @classmethod -    def _extract_dmcloud_url(cls, webpage): -        mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % cls._VALID_EMBED_URL, webpage) -        if mobj: -            return mobj.group(1) - -        mobj = re.search( -            r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % cls._VALID_EMBED_URL, -            webpage) -        if mobj: -            return mobj.group(1) - -    def _real_extract(self, url): -        video_id = self._match_id(url) - -        webpage = self._download_webpage_no_ff(url, video_id) - -        title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title') - -        video_info = self._parse_json(self._search_regex( -            r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id) - -        # TODO: parse ios_url, which is in fact a manifest -        video_url = video_info['mp4_url'] - -        return { -            'id': video_id, -            'url': video_url, -            'title': title, -            'thumbnail': video_info.get('thumbnail_url'), -        } diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 55853f76f..f9cec1d23 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -1,14 +1,18 @@  from __future__ import 
unicode_literals -from .common import InfoExtractor +import random +import re +import string + +from .discoverygo import DiscoveryGoBaseIE  from ..utils import ( -    parse_duration, -    parse_iso8601, +    ExtractorError, +    update_url_query,  ) -from ..compat import compat_str +from ..compat import compat_HTTPError -class DiscoveryIE(InfoExtractor): +class DiscoveryIE(DiscoveryGoBaseIE):      _VALID_URL = r'''(?x)https?://(?:www\.)?(?:              discovery|              investigationdiscovery| @@ -19,79 +23,65 @@ class DiscoveryIE(InfoExtractor):              sciencechannel|              tlc|              velocity -        )\.com/(?:[^/]+/)*(?P<id>[^./?#]+)''' +        )\.com(?P<path>/tv-shows/[^/]+/(?:video|full-episode)s/(?P<id>[^./?#]+))'''      _TESTS = [{ -        'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', +        'url': 'https://www.discovery.com/tv-shows/cash-cab/videos/dave-foley',          'info_dict': { -            'id': '20769', +            'id': '5a2d9b4d6b66d17a5026e1fd',              'ext': 'mp4', -            'title': 'Mission Impossible Outtakes', -            'description': ('Watch Jamie Hyneman and Adam Savage practice being' -                            ' each other -- to the point of confusing Jamie\'s dog -- and ' -                            'don\'t miss Adam moon-walking as Jamie ... 
behind Jamie\'s' -                            ' back.'), -            'duration': 156, -            'timestamp': 1302032462, -            'upload_date': '20110405', -            'uploader_id': '103207', +            'title': 'Dave Foley', +            'description': 'md5:4b39bcafccf9167ca42810eb5f28b01f', +            'duration': 608,          },          'params': {              'skip_download': True,  # requires ffmpeg          }      }, { -        'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons', -        'info_dict': { -            'id': 'mythbusters-the-simpsons', -            'title': 'MythBusters: The Simpsons', -        }, -        'playlist_mincount': 10, -    }, { -        'url': 'http://www.animalplanet.com/longfin-eels-maneaters/', -        'info_dict': { -            'id': '78326', -            'ext': 'mp4', -            'title': 'Longfin Eels: Maneaters?', -            'description': 'Jeremy Wade tests whether or not New Zealand\'s longfin eels are man-eaters by covering himself in fish guts and getting in the water with them.', -            'upload_date': '20140725', -            'timestamp': 1406246400, -            'duration': 116, -            'uploader_id': '103207', -        }, -        'params': { -            'skip_download': True,  # requires ffmpeg -        } +        'url': 'https://www.investigationdiscovery.com/tv-shows/final-vision/full-episodes/final-vision', +        'only_matching': True,      }] +    _GEO_COUNTRIES = ['US'] +    _GEO_BYPASS = False      def _real_extract(self, url): -        display_id = self._match_id(url) -        info = self._download_json(url + '?flat=1', display_id) - -        video_title = info.get('playlist_title') or info.get('video_title') +        path, display_id = re.match(self._VALID_URL, url).groups() +        webpage = self._download_webpage(url, display_id) -        entries = [] +        react_data = self._parse_json(self._search_regex( +            
r'window\.__reactTransmitPacket\s*=\s*({.+?});', +            webpage, 'react data'), display_id) +        content_blocks = react_data['layout'][path]['contentBlocks'] +        video = next(cb for cb in content_blocks if cb.get('type') == 'video')['content']['items'][0] +        video_id = video['id'] -        for idx, video_info in enumerate(info['playlist']): -            subtitles = {} -            caption_url = video_info.get('captionsUrl') -            if caption_url: -                subtitles = { -                    'en': [{ -                        'url': caption_url, -                    }] -                } +        access_token = self._download_json( +            'https://www.discovery.com/anonymous', display_id, query={ +                'authLink': update_url_query( +                    'https://login.discovery.com/v1/oauth2/authorize', { +                        'client_id': react_data['application']['apiClientId'], +                        'redirect_uri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html', +                        'response_type': 'anonymous', +                        'state': 'nonce,' + ''.join([random.choice(string.ascii_letters) for _ in range(32)]), +                    }) +            })['access_token'] -            entries.append({ -                '_type': 'url_transparent', -                'url': 'http://players.brightcove.net/103207/default_default/index.html?videoId=ref:%s' % video_info['referenceId'], -                'id': compat_str(video_info['id']), -                'title': video_info['title'], -                'description': video_info.get('description'), -                'duration': parse_duration(video_info.get('video_length')), -                'webpage_url': video_info.get('href') or video_info.get('url'), -                'thumbnail': video_info.get('thumbnailURL'), -                'alt_title': video_info.get('secondary_title'), -                'timestamp': 
parse_iso8601(video_info.get('publishedDate')), -                'subtitles': subtitles, -            }) +        try: +            stream = self._download_json( +                'https://api.discovery.com/v1/streaming/video/' + video_id, +                display_id, headers={ +                    'Authorization': 'Bearer ' + access_token, +                }) +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: +                e_description = self._parse_json( +                    e.cause.read().decode(), display_id)['description'] +                if 'resource not available for country' in e_description: +                    self.raise_geo_restricted(countries=self._GEO_COUNTRIES) +                if 'Authorized Networks' in e_description: +                    raise ExtractorError( +                        'This video is only available via cable service provider subscription that' +                        ' is not currently supported. 
You may want to use --cookies.', expected=True) +                raise ExtractorError(e_description) +            raise -        return self.playlist_result(entries, display_id, video_title) +        return self._extract_video_info(video, stream, display_id) diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index 7cd5d4291..3368c4c07 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -5,6 +5,7 @@ import re  from .common import InfoExtractor  from ..compat import compat_str  from ..utils import ( +    determine_ext,      extract_attributes,      ExtractorError,      int_or_none, @@ -27,42 +28,9 @@ class DiscoveryGoBaseIE(InfoExtractor):              velocitychannel          )go\.com/%s(?P<id>[^/?#&]+)''' - -class DiscoveryGoIE(DiscoveryGoBaseIE): -    _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+' -    _GEO_COUNTRIES = ['US'] -    _TEST = { -        'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/', -        'info_dict': { -            'id': '58c167d86b66d12f2addeb01', -            'ext': 'mp4', -            'title': 'Reaper Madness', -            'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78', -            'duration': 2519, -            'series': 'Bering Sea Gold', -            'season_number': 8, -            'episode_number': 6, -            'age_limit': 14, -        }, -    } - -    def _real_extract(self, url): -        display_id = self._match_id(url) - -        webpage = self._download_webpage(url, display_id) - -        container = extract_attributes( -            self._search_regex( -                r'(<div[^>]+class=["\']video-player-container[^>]+>)', -                webpage, 'video container')) - -        video = self._parse_json( -            container.get('data-video') or container.get('data-json'), -            display_id) - +    def _extract_video_info(self, video, stream, display_id):          title = video['name'] -        
stream = video.get('stream')          if not stream:              if video.get('authenticated') is True:                  raise ExtractorError( @@ -106,7 +74,11 @@ class DiscoveryGoIE(DiscoveryGoBaseIE):                          not subtitle_url.startswith('http')):                      continue                  lang = caption.get('fileLang', 'en') -                subtitles.setdefault(lang, []).append({'url': subtitle_url}) +                ext = determine_ext(subtitle_url) +                subtitles.setdefault(lang, []).append({ +                    'url': subtitle_url, +                    'ext': 'ttml' if ext == 'xml' else ext, +                })          return {              'id': video_id, @@ -124,6 +96,43 @@ class DiscoveryGoIE(DiscoveryGoBaseIE):          } +class DiscoveryGoIE(DiscoveryGoBaseIE): +    _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+' +    _GEO_COUNTRIES = ['US'] +    _TEST = { +        'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/', +        'info_dict': { +            'id': '58c167d86b66d12f2addeb01', +            'ext': 'mp4', +            'title': 'Reaper Madness', +            'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78', +            'duration': 2519, +            'series': 'Bering Sea Gold', +            'season_number': 8, +            'episode_number': 6, +            'age_limit': 14, +        }, +    } + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        container = extract_attributes( +            self._search_regex( +                r'(<div[^>]+class=["\']video-player-container[^>]+>)', +                webpage, 'video container')) + +        video = self._parse_json( +            container.get('data-video') or container.get('data-json'), +            display_id) + +        stream = video.get('stream') + +        return self._extract_video_info(video, stream, display_id) + +  class 
DiscoveryGoPlaylistIE(DiscoveryGoBaseIE):      _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % ''      _TEST = { diff --git a/youtube_dl/extractor/disney.py b/youtube_dl/extractor/disney.py index 968c4c7fd..0eee82fd6 100644 --- a/youtube_dl/extractor/disney.py +++ b/youtube_dl/extractor/disney.py @@ -10,6 +10,7 @@ from ..utils import (      compat_str,      determine_ext,      ExtractorError, +    update_url_query,  ) @@ -108,9 +109,16 @@ class DisneyIE(InfoExtractor):                  continue              tbr = int_or_none(flavor.get('bitrate'))              if tbr == 99999: -                formats.extend(self._extract_m3u8_formats( +                # wrong ks(Kaltura Signature) causes 404 Error +                flavor_url = update_url_query(flavor_url, {'ks': ''}) +                m3u8_formats = self._extract_m3u8_formats(                      flavor_url, video_id, 'mp4', -                    m3u8_id=flavor_format, fatal=False)) +                    m3u8_id=flavor_format, fatal=False) +                for f in m3u8_formats: +                    # Apple FairPlay +                    if '/fpshls/' in f['url']: +                        continue +                    formats.append(f)                  continue              format_id = []              if flavor_format: diff --git a/youtube_dl/extractor/ellentube.py b/youtube_dl/extractor/ellentube.py new file mode 100644 index 000000000..544473274 --- /dev/null +++ b/youtube_dl/extractor/ellentube.py @@ -0,0 +1,133 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    clean_html, +    extract_attributes, +    float_or_none, +    int_or_none, +    try_get, +) + + +class EllenTubeBaseIE(InfoExtractor): +    def _extract_data_config(self, webpage, video_id): +        details = self._search_regex( +            r'(<[^>]+\bdata-component=(["\'])[Dd]etails.+?></div>)', webpage, +            'details') +        return self._parse_json( +            
extract_attributes(details)['data-config'], video_id) + +    def _extract_video(self, data, video_id): +        title = data['title'] + +        formats = [] +        duration = None +        for entry in data.get('media'): +            if entry.get('id') == 'm3u8': +                formats = self._extract_m3u8_formats( +                    entry['url'], video_id, 'mp4', +                    entry_protocol='m3u8_native', m3u8_id='hls') +                duration = int_or_none(entry.get('duration')) +                break +        self._sort_formats(formats) + +        def get_insight(kind): +            return int_or_none(try_get( +                data, lambda x: x['insight']['%ss' % kind])) + +        return { +            'extractor_key': EllenTubeIE.ie_key(), +            'id': video_id, +            'title': title, +            'description': data.get('description'), +            'duration': duration, +            'thumbnail': data.get('thumbnail'), +            'timestamp': float_or_none(data.get('publishTime'), scale=1000), +            'view_count': get_insight('view'), +            'like_count': get_insight('like'), +            'formats': formats, +        } + + +class EllenTubeIE(EllenTubeBaseIE): +    _VALID_URL = r'''(?x) +                        (?: +                            ellentube:| +                            https://api-prod\.ellentube\.com/ellenapi/api/item/ +                        ) +                        (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) +                    ''' +    _TESTS = [{ +        'url': 'https://api-prod.ellentube.com/ellenapi/api/item/0822171c-3829-43bf-b99f-d77358ae75e3', +        'md5': '2fabc277131bddafdd120e0fc0f974c9', +        'info_dict': { +            'id': '0822171c-3829-43bf-b99f-d77358ae75e3', +            'ext': 'mp4', +            'title': 'Ellen Meets Las Vegas Survivors Jesus Campos and Stephen Schuck', +            'description': 'md5:76e3355e2242a78ad9e3858e5616923f', +            
'thumbnail': r're:^https?://.+?', +            'duration': 514, +            'timestamp': 1508505120, +            'upload_date': '20171020', +            'view_count': int, +            'like_count': int, +        } +    }, { +        'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        data = self._download_json( +            'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id, +            video_id) +        return self._extract_video(data, video_id) + + +class EllenTubeVideoIE(EllenTubeBaseIE): +    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P<id>.+?)\.html' +    _TEST = { +        'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html', +        'only_matching': True, +    } + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        video_id = self._extract_data_config(webpage, display_id)['id'] +        return self.url_result( +            'ellentube:%s' % video_id, ie=EllenTubeIE.ie_key(), +            video_id=video_id) + + +class EllenTubePlaylistIE(EllenTubeBaseIE): +    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/(?:episode|studios)/(?P<id>.+?)\.html' +    _TESTS = [{ +        'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html', +        'info_dict': { +            'id': 'dax-shepard-jordan-fisher-haim', +            'title': "Dax Shepard, 'DWTS' Team Jordan Fisher & Lindsay Arnold, HAIM", +            'description': 'md5:bfc982194dabb3f4e325e43aa6b2e21c', +        }, +        'playlist_count': 6, +    }, { +        'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = 
self._download_webpage(url, display_id) +        data = self._extract_data_config(webpage, display_id)['data'] +        feed = self._download_json( +            'https://api-prod.ellentube.com/ellenapi/api/feed/?%s' +            % data['filter'], display_id) +        entries = [ +            self._extract_video(elem, elem['id']) +            for elem in feed if elem.get('type') == 'VIDEO' and elem.get('id')] +        return self.playlist_result( +            entries, display_id, data.get('title'), +            clean_html(data.get('description'))) diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py deleted file mode 100644 index e0a13dd76..000000000 --- a/youtube_dl/extractor/ellentv.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from .kaltura import KalturaIE -from ..utils import NO_DEFAULT - - -class EllenTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)' -    _TESTS = [{ -        'url': 'http://www.ellentv.com/videos/0-ipq1gsai/', -        'md5': '4294cf98bc165f218aaa0b89e0fd8042', -        'info_dict': { -            'id': '0_ipq1gsai', -            'ext': 'mov', -            'title': 'Fast Fingers of Fate', -            'description': 'md5:3539013ddcbfa64b2a6d1b38d910868a', -            'timestamp': 1428035648, -            'upload_date': '20150403', -            'uploader_id': 'batchUser', -        }, -    }, { -        # not available via http://widgets.ellentube.com/ -        'url': 'http://www.ellentv.com/videos/1-szkgu2m2/', -        'info_dict': { -            'id': '1_szkgu2m2', -            'ext': 'flv', -            'title': "Ellen's Amazingly Talented Audience", -            'description': 'md5:86ff1e376ff0d717d7171590e273f0a5', -            'timestamp': 1255140900, -            'upload_date': '20091010', -            'uploader_id': 'ellenkaltura@gmail.com', -        }, -        
'params': { -            'skip_download': True, -        }, -    }] - -    def _real_extract(self, url): -        video_id = self._match_id(url) - -        URLS = ('http://widgets.ellentube.com/videos/%s' % video_id, url) - -        for num, url_ in enumerate(URLS, 1): -            webpage = self._download_webpage( -                url_, video_id, fatal=num == len(URLS)) - -            default = NO_DEFAULT if num == len(URLS) else None - -            partner_id = self._search_regex( -                r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id', -                default=default) - -            kaltura_id = self._search_regex( -                [r'id="kaltura_player_([^"]+)"', -                 r"_wb_entry_id\s*:\s*'([^']+)", -                 r'data-kaltura-entry-id="([^"]+)'], -                webpage, 'kaltura id', default=default) - -            if partner_id and kaltura_id: -                break - -        return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), KalturaIE.ie_key()) - - -class EllenTVClipsIE(InfoExtractor): -    IE_NAME = 'EllenTV:clips' -    _VALID_URL = r'https?://(?:www\.)?ellentv\.com/episodes/(?P<id>[a-z0-9_-]+)' -    _TEST = { -        'url': 'http://www.ellentv.com/episodes/meryl-streep-vanessa-hudgens/', -        'info_dict': { -            'id': 'meryl-streep-vanessa-hudgens', -            'title': 'Meryl Streep, Vanessa Hudgens', -        }, -        'playlist_mincount': 5, -    } - -    def _real_extract(self, url): -        playlist_id = self._match_id(url) - -        webpage = self._download_webpage(url, playlist_id) -        playlist = self._extract_playlist(webpage, playlist_id) - -        return { -            '_type': 'playlist', -            'id': playlist_id, -            'title': self._og_search_title(webpage), -            'entries': self._extract_entries(playlist) -        } - -    def _extract_playlist(self, webpage, playlist_id): -        json_string = 
self._search_regex(r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json') -        return self._parse_json('[{' + json_string + '}]', playlist_id) - -    def _extract_entries(self, playlist): -        return [ -            self.url_result( -                'kaltura:%s:%s' % (item['kaltura_partner_id'], item['kaltura_entry_id']), -                KalturaIE.ie_key(), video_id=item['kaltura_entry_id']) -            for item in playlist] diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 7a7436068..127c69b2e 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -1,6 +1,9 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor +from .once import OnceIE  from ..compat import compat_str  from ..utils import (      determine_ext, @@ -9,22 +12,27 @@ from ..utils import (  ) -class ESPNIE(InfoExtractor): +class ESPNIE(OnceIE):      _VALID_URL = r'''(?x)                      https?://                          (?: -                            (?:(?:\w+\.)+)?espn\.go| -                            (?:www\.)?espn -                        )\.com/ -                        (?: -                            (?: -                                video/clip| -                                watch/player -                            )                              (?: -                                \?.*?\bid=| -                                /_/id/ -                            ) +                                (?: +                                    (?:(?:\w+\.)+)?espn\.go| +                                    (?:www\.)?espn +                                )\.com/ +                                (?: +                                    (?: +                                        video/(?:clip|iframe/twitter)| +                                        watch/player +                                    ) +                                    (?: +                                        
.*?\?.*?\bid=| +                                        /_/id/ +                                    ) +                                ) +                            )| +                            (?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/                          )                          (?P<id>\d+)                      ''' @@ -77,6 +85,15 @@ class ESPNIE(InfoExtractor):      }, {          'url': 'http://www.espn.com/video/clip/_/id/17989860',          'only_matching': True, +    }, { +        'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', +        'only_matching': True, +    }, { +        'url': 'http://www.espnfc.us/video/espn-fc-tv/86/video/3319154/nashville-unveiled-as-the-newest-club-in-mls', +        'only_matching': True, +    }, { +        'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -93,7 +110,9 @@ class ESPNIE(InfoExtractor):          def traverse_source(source, base_source_id=None):              for source_id, source in source.items(): -                if isinstance(source, compat_str): +                if source_id == 'alert': +                    continue +                elif isinstance(source, compat_str):                      extract_source(source, base_source_id)                  elif isinstance(source, dict):                      traverse_source( @@ -106,7 +125,9 @@ class ESPNIE(InfoExtractor):                  return              format_urls.add(source_url)              ext = determine_ext(source_url) -            if ext == 'smil': +            if OnceIE.suitable(source_url): +                formats.extend(self._extract_once_formats(source_url)) +            elif ext == 'smil':                  formats.extend(self._extract_smil_formats(                      source_url, video_id, fatal=False))              elif ext == 'f4m': @@ -117,12 +138,24 @@ class 
ESPNIE(InfoExtractor):                      source_url, video_id, 'mp4', entry_protocol='m3u8_native',                      m3u8_id=source_id, fatal=False))              else: -                formats.append({ +                f = {                      'url': source_url,                      'format_id': source_id, -                }) - -        traverse_source(clip['links']['source']) +                } +                mobj = re.search(r'(\d+)p(\d+)_(\d+)k\.', source_url) +                if mobj: +                    f.update({ +                        'height': int(mobj.group(1)), +                        'fps': int(mobj.group(2)), +                        'tbr': int(mobj.group(3)), +                    }) +                if source_id == 'mezzanine': +                    f['preference'] = 1 +                formats.append(f) + +        links = clip.get('links', {}) +        traverse_source(links.get('source', {})) +        traverse_source(links.get('mobile', {}))          self._sort_formats(formats)          description = clip.get('caption') or clip.get('description') @@ -144,9 +177,6 @@ class ESPNIE(InfoExtractor):  class ESPNArticleIE(InfoExtractor):      _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P<id>[^/]+)'      _TESTS = [{ -        'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', -        'only_matching': True, -    }, {          'url': 'http://espn.go.com/nba/recap?gameId=400793786',          'only_matching': True,      }, { @@ -175,3 +205,34 @@ class ESPNArticleIE(InfoExtractor):          return self.url_result(              'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) + + +class FiveThirtyEightIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?fivethirtyeight\.com/features/(?P<id>[^/?#]+)' +    _TEST = { +        'url': 'http://fivethirtyeight.com/features/how-the-6-8-raiders-can-still-make-the-playoffs/', +        'info_dict': { +            'id': '21846851', +           
 'ext': 'mp4', +            'title': 'FiveThirtyEight: The Raiders can still make the playoffs', +            'description': 'Neil Paine breaks down the simplest scenario that will put the Raiders into the playoffs at 8-8.', +            'timestamp': 1513960621, +            'upload_date': '20171222', +        }, +        'params': { +            'skip_download': True, +        }, +        'expected_warnings': ['Unable to download f4m manifest'], +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        video_id = self._search_regex( +            r'data-video-id=["\'](?P<id>\d+)', +            webpage, 'video id', group='id') + +        return self.url_result( +            'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f1ea735b5..e863f03ab 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -138,10 +138,7 @@ from .brightcove import (      BrightcoveNewIE,  )  from .buzzfeed import BuzzFeedIE -from .byutv import ( -    BYUtvIE, -    BYUtvEventIE, -) +from .byutv import BYUtvIE  from .c56 import C56IE  from .camdemy import (      CamdemyIE, @@ -246,7 +243,6 @@ from .dailymotion import (      DailymotionIE,      DailymotionPlaylistIE,      DailymotionUserIE, -    DailymotionCloudIE,  )  from .daisuki import (      DaisukiMottoIE, @@ -312,9 +308,10 @@ from .ehow import EHowIE  from .eighttracks import EightTracksIE  from .einthusan import EinthusanIE  from .eitb import EitbIE -from .ellentv import ( -    EllenTVIE, -    EllenTVClipsIE, +from .ellentube import ( +    EllenTubeIE, +    EllenTubeVideoIE, +    EllenTubePlaylistIE,  )  from .elpais import ElPaisIE  from .embedly import EmbedlyIE @@ -325,6 +322,7 @@ from .escapist import EscapistIE  from .espn import (      ESPNIE,      ESPNArticleIE, +    FiveThirtyEightIE,  )  
from .esri import EsriVideoIE  from .etonline import ETOnlineIE @@ -689,6 +687,7 @@ from .nhl import (  )  from .nick import (      NickIE, +    NickBrIE,      NickDeIE,      NickNightIE,      NickRuIE, @@ -721,10 +720,6 @@ from .nowness import (      NownessPlaylistIE,      NownessSeriesIE,  ) -from .nowtv import ( -    NowTVIE, -    NowTVListIE, -)  from .noz import NozIE  from .npo import (      AndereTijdenIE, @@ -857,6 +852,7 @@ from .radiofrance import RadioFranceIE  from .rai import (      RaiPlayIE,      RaiPlayLiveIE, +    RaiPlayPlaylistIE,      RaiIE,  )  from .rbmaradio import RBMARadioIE @@ -931,8 +927,12 @@ from .senateisvp import SenateISVPIE  from .sendtonews import SendtoNewsIE  from .servingsys import ServingSysIE  from .servus import ServusIE +from .sevenplus import SevenPlusIE  from .sexu import SexuIE -from .shahid import ShahidIE +from .shahid import ( +    ShahidIE, +    ShahidShowIE, +)  from .shared import (      SharedIE,      VivoIE, @@ -1000,6 +1000,7 @@ from .streamango import StreamangoIE  from .streamcloud import StreamcloudIE  from .streamcz import StreamCZIE  from .streetvoice import StreetVoiceIE +from .stretchinternet import StretchInternetIE  from .sunporno import SunPornoIE  from .svt import (      SVTIE, @@ -1102,6 +1103,10 @@ from .tvigle import TvigleIE  from .tvland import TVLandIE  from .tvn24 import TVN24IE  from .tvnoe import TVNoeIE +from .tvnow import ( +    TVNowIE, +    TVNowListIE, +)  from .tvp import (      TVPEmbedIE,      TVPIE, @@ -1139,6 +1144,7 @@ from .udemy import (  from .udn import UDNEmbedIE  from .uktvplay import UKTVPlayIE  from .digiteka import DigitekaIE +from .umg import UMGDeIE  from .unistra import UnistraIE  from .unity import UnityIE  from .uol import UOLIE diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 5f98d017b..11d6c9c32 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -11,6 +11,7 @@ from ..utils import (      parse_duration,      
try_get,      unified_timestamp, +    update_url_query,  ) @@ -62,7 +63,8 @@ class FOXIE(AdobePassIE):          duration = int_or_none(video.get('durationInSeconds')) or int_or_none(              video.get('duration')) or parse_duration(video.get('duration'))          timestamp = unified_timestamp(video.get('datePublished')) -        age_limit = parse_age_limit(video.get('contentRating')) +        rating = video.get('contentRating') +        age_limit = parse_age_limit(rating)          data = try_get(              video, lambda x: x['trackingData']['properties'], dict) or {} @@ -77,8 +79,24 @@ class FOXIE(AdobePassIE):          release_year = int_or_none(video.get('releaseYear'))          if data.get('authRequired'): -            # TODO: AP -            pass +            resource = self._get_mvpd_resource( +                'fbc-fox', title, video.get('guid'), rating) +            release_url = update_url_query( +                release_url, { +                    'auth': self._extract_mvpd_auth( +                        url, video_id, 'fbc-fox', resource) +                }) + +        subtitles = {} +        for doc_rel in video.get('documentReleases', []): +            rel_url = doc_rel.get('url') +            if not url or doc_rel.get('format') != 'SCC': +                continue +            subtitles['en'] = [{ +                'url': rel_url, +                'ext': 'scc', +            }] +            break          info = {              'id': video_id, @@ -93,6 +111,7 @@ class FOXIE(AdobePassIE):              'episode': episode,              'episode_number': episode_number,              'release_year': release_year, +            'subtitles': subtitles,          }          urlh = self._request_webpage(HEADRequest(release_url), video_id) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 5a3abeaff..095bb3954 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -13,10 +13,7 @@ from ..utils 
import (      parse_duration,      determine_ext,  ) -from .dailymotion import ( -    DailymotionIE, -    DailymotionCloudIE, -) +from .dailymotion import DailymotionIE  class FranceTVBaseInfoExtractor(InfoExtractor): @@ -290,10 +287,6 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):          page_title = mobj.group('title')          webpage = self._download_webpage(url, page_title) -        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) -        if dmcloud_url: -            return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key()) -          dailymotion_urls = DailymotionIE._extract_urls(webpage)          if dailymotion_urls:              return self.playlist_result([ @@ -363,6 +356,7 @@ class CultureboxIE(FranceTVBaseInfoExtractor):              raise ExtractorError('Video %s is not available' % name, expected=True)          video_id, catalogue = self._search_regex( -            r'"https?://videos\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video id').split('@') +            r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]', +            webpage, 'video id').split('@')          return self._extract_video(video_id, catalogue) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 56df2ab47..c7b609215 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -59,10 +59,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE  from .drtuber import DrTuberIE  from .redtube import RedTubeIE  from .vimeo import VimeoIE -from .dailymotion import ( -    DailymotionIE, -    DailymotionCloudIE, -) +from .dailymotion import DailymotionIE  from .dailymail import DailyMailIE  from .onionstudios import OnionStudiosIE  from .viewlift import ViewLiftEmbedIE @@ -1472,23 +1469,6 @@ class GenericIE(InfoExtractor):                  'timestamp': 1432570283,              },          }, -        # Dailymotion Cloud video -        { -            'url': 
'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', -            'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', -            'info_dict': { -                'id': 'x2uy8t3', -                'ext': 'mp4', -                'title': 'Sauvons les abeilles ! - Le débat', -                'description': 'md5:d9082128b1c5277987825d684939ca26', -                'thumbnail': r're:^https?://.*\.jpe?g$', -                'timestamp': 1434970506, -                'upload_date': '20150622', -                'uploader': 'Public Sénat', -                'uploader_id': 'xa9gza', -            }, -            'skip': 'File not found.', -        },          # OnionStudios embed          {              'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', @@ -2195,7 +2175,7 @@ class GenericIE(InfoExtractor):                  return self.playlist_result(self._parse_xspf(doc, video_id), video_id)              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):                  info_dict['formats'] = self._parse_mpd_formats( -                    doc, video_id, +                    doc,                      mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0],                      mpd_url=url)                  self._sort_formats(info_dict['formats']) @@ -2704,11 +2684,6 @@ class GenericIE(InfoExtractor):          if senate_isvp_url:              return self.url_result(senate_isvp_url, 'SenateISVP') -        # Look for Dailymotion Cloud videos -        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) -        if dmcloud_url: -            return self.url_result(dmcloud_url, 'DailymotionCloud') -          # Look for OnionStudios embeds          onionstudios_url = OnionStudiosIE._extract_url(webpage)          if onionstudios_url: diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 413a219dc..18a7d7f8c 100644 --- 
a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -26,7 +26,7 @@ from ..utils import (  class ITVIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'      _GEO_COUNTRIES = ['GB'] -    _TEST = { +    _TESTS = [{          'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053',          'info_dict': {              'id': '2a2936a0053', @@ -37,7 +37,11 @@ class ITVIE(InfoExtractor):              # rtmp download              'skip_download': True,          }, -    } +    }, { +        # unavailable via data-playlist-url +        'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -101,6 +105,18 @@ class ITVIE(InfoExtractor):              'Content-Type': 'text/xml; charset=utf-8',              'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist',          }) + +        info = self._search_json_ld(webpage, video_id, default={}) +        formats = [] +        subtitles = {} + +        def extract_subtitle(sub_url): +            ext = determine_ext(sub_url, 'ttml') +            subtitles.setdefault('en', []).append({ +                'url': sub_url, +                'ext': 'ttml' if ext == 'xml' else ext, +            }) +          resp_env = self._download_xml(              params['data-playlist-url'], video_id,              headers=headers, data=etree.tostring(req_env)) @@ -111,37 +127,55 @@ class ITVIE(InfoExtractor):              if fault_code == 'InvalidGeoRegion':                  self.raise_geo_restricted(                      msg=fault_string, countries=self._GEO_COUNTRIES) -            raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string)) -        title = xpath_text(playlist, 'EpisodeTitle', fatal=True) -        video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) -        media_files = xpath_element(video_element, 
'MediaFiles', fatal=True) -        rtmp_url = media_files.attrib['base'] +            elif fault_code != 'InvalidEntity': +                raise ExtractorError( +                    '%s said: %s' % (self.IE_NAME, fault_string), expected=True) +            info.update({ +                'title': self._og_search_title(webpage), +                'episode_title': params.get('data-video-episode'), +                'series': params.get('data-video-title'), +            }) +        else: +            title = xpath_text(playlist, 'EpisodeTitle', default=None) +            info.update({ +                'title': title, +                'episode_title': title, +                'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), +                'series': xpath_text(playlist, 'ProgrammeTitle'), +                'duration': parse_duration(xpath_text(playlist, 'Duration')), +            }) +            video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) +            media_files = xpath_element(video_element, 'MediaFiles', fatal=True) +            rtmp_url = media_files.attrib['base'] -        formats = [] -        for media_file in media_files.findall('MediaFile'): -            play_path = xpath_text(media_file, 'URL') -            if not play_path: -                continue -            tbr = int_or_none(media_file.get('bitrate'), 1000) -            f = { -                'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), -                'play_path': play_path, -                # Providing this swfVfy allows to avoid truncated downloads -                'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', -                'page_url': url, -                'tbr': tbr, -                'ext': 'flv', -            } -            app = self._search_regex( -                'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) -            if app: -                f.update({ -                    'url': rtmp_url.split('?', 
1)[0], -                    'app': app, -                }) -            else: -                f['url'] = rtmp_url -            formats.append(f) +            for media_file in media_files.findall('MediaFile'): +                play_path = xpath_text(media_file, 'URL') +                if not play_path: +                    continue +                tbr = int_or_none(media_file.get('bitrate'), 1000) +                f = { +                    'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), +                    'play_path': play_path, +                    # Providing this swfVfy allows to avoid truncated downloads +                    'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', +                    'page_url': url, +                    'tbr': tbr, +                    'ext': 'flv', +                } +                app = self._search_regex( +                    'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) +                if app: +                    f.update({ +                        'url': rtmp_url.split('?', 1)[0], +                        'app': app, +                    }) +                else: +                    f['url'] = rtmp_url +                formats.append(f) + +            for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): +                if caption_url.text: +                    extract_subtitle(caption_url.text)          ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')          hmac = params.get('data-video-hmac') @@ -198,27 +232,22 @@ class ITVIE(InfoExtractor):                          formats.append({                              'url': href,                          }) -        self._sort_formats(formats) +                subs = video_data.get('Subtitles') +                if isinstance(subs, list): +                    for sub in subs: +                        if not isinstance(sub, dict): +                            continue +                    
    href = sub.get('Href') +                        if isinstance(href, compat_str): +                            extract_subtitle(href) +                if not info.get('duration'): +                    info['duration'] = parse_duration(video_data.get('Duration')) -        subtitles = {} -        for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): -            if not caption_url.text: -                continue -            ext = determine_ext(caption_url.text, 'ttml') -            subtitles.setdefault('en', []).append({ -                'url': caption_url.text, -                'ext': 'ttml' if ext == 'xml' else ext, -            }) +        self._sort_formats(formats) -        info = self._search_json_ld(webpage, video_id, default={})          info.update({              'id': video_id, -            'title': title,              'formats': formats,              'subtitles': subtitles, -            'episode_title': title, -            'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), -            'series': xpath_text(playlist, 'ProgrammeTitle'), -            'duartion': parse_duration(xpath_text(playlist, 'Duration')),          })          return info diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index bdac2df3e..562e25f6d 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -125,9 +125,12 @@ class KalturaIE(InfoExtractor):                          (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)*                      (?P=q1).*?                      
(?: -                        entry_?[Ii]d| -                        (?P<q2>["'])entry_?[Ii]d(?P=q2) -                    )\s*:\s* +                        (?: +                            entry_?[Ii]d| +                            (?P<q2>["'])entry_?[Ii]d(?P=q2) +                        )\s*:\s*| +                        \[\s*(?P<q2_1>["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* +                    )                      (?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3)                  ''', webpage) or              re.search( diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index f7cc3c832..6b7c5e3e0 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -13,8 +13,15 @@ from ..utils import (  class MailRuIE(InfoExtractor):      IE_NAME = 'mailru'      IE_DESC = 'Видео@Mail.Ru' -    _VALID_URL = r'https?://(?:(?:www|m)\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' - +    _VALID_URL = r'''(?x) +                    https?:// +                        (?:(?:www|m)\.)?my\.mail\.ru/ +                        (?: +                            video/.*\#video=/?(?P<idv1>(?:[^/]+/){3}\d+)| +                            (?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html| +                            (?:video/embed|\+/video/meta)/(?P<metaid>\d+) +                        ) +                    '''      _TESTS = [          {              'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', @@ -23,7 +30,7 @@ class MailRuIE(InfoExtractor):                  'id': '46301138_76',                  'ext': 'mp4',                  'title': 'Новый Человек-Паук. Высокое напряжение. 
Восстание Электро', -                'timestamp': 1393232740, +                'timestamp': 1393235077,                  'upload_date': '20140224',                  'uploader': 'sonypicturesrus',                  'uploader_id': 'sonypicturesrus@mail.ru', @@ -40,7 +47,7 @@ class MailRuIE(InfoExtractor):                  'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion',                  'timestamp': 1397039888,                  'upload_date': '20140409', -                'uploader': 'hitech@corp.mail.ru', +                'uploader': 'hitech',                  'uploader_id': 'hitech@corp.mail.ru',                  'duration': 245,              }, @@ -65,28 +72,42 @@ class MailRuIE(InfoExtractor):          {              'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html',              'only_matching': True, +        }, +        { +            'url': 'https://my.mail.ru/video/embed/7949340477499637815', +            'only_matching': True, +        }, +        { +            'url': 'http://my.mail.ru/+/video/meta/7949340477499637815', +            'only_matching': True,          }      ]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('idv1') - -        if not video_id: -            video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') - -        webpage = self._download_webpage(url, video_id) +        meta_id = mobj.group('metaid') + +        video_id = None +        if meta_id: +            meta_url = 'https://my.mail.ru/+/video/meta/%s' % meta_id +        else: +            video_id = mobj.group('idv1') +            if not video_id: +                video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') +            webpage = self._download_webpage(url, video_id) +            page_config = self._parse_json(self._search_regex( +                r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>', +                webpage, 'page config', 
default='{}'), video_id, fatal=False) +            if page_config: +                meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') +            else: +                meta_url = None          video_data = None - -        page_config = self._parse_json(self._search_regex( -            r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>', -            webpage, 'page config', default='{}'), video_id, fatal=False) -        if page_config: -            meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') -            if meta_url: -                video_data = self._download_json( -                    meta_url, video_id, 'Downloading video meta JSON', fatal=False) +        if meta_url: +            video_data = self._download_json( +                meta_url, video_id or meta_id, 'Downloading video meta JSON', +                fatal=not video_id)          # Fallback old approach          if not video_data: diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 310eea2cf..7edd68472 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -10,7 +10,7 @@ from ..utils import update_url_query  class NickIE(MTVServicesInfoExtractor):      # None of videos on the website are still alive?      
IE_NAME = 'nick.com' -    _VALID_URL = r'https?://(?:(?:www|beta)\.)?nick(?:jr)?\.com/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)' +    _VALID_URL = r'https?://(?P<domain>(?:(?:www|beta)\.)?nick(?:jr)?\.com)/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'      _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'      _GEO_COUNTRIES = ['US']      _TESTS = [{ @@ -69,8 +69,59 @@ class NickIE(MTVServicesInfoExtractor):              'mgid': uri,          } -    def _extract_mgid(self, webpage): -        return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid') +    def _real_extract(self, url): +        domain, display_id = re.match(self._VALID_URL, url).groups() +        video_data = self._download_json( +            'http://%s/data/video.endLevel.json' % domain, +            display_id, query={ +                'urlKey': display_id, +            }) +        return self._get_videos_info(video_data['player'] + video_data['id']) + + +class NickBrIE(MTVServicesInfoExtractor): +    IE_NAME = 'nickelodeon:br' +    _VALID_URL = r'https?://(?P<domain>(?:www\.)?nickjr|mundonick\.uol)\.com\.br/(?:programas/)?[^/]+/videos/(?:episodios/)?(?P<id>[^/?#.]+)' +    _TESTS = [{ +        'url': 'http://www.nickjr.com.br/patrulha-canina/videos/210-labirinto-de-pipoca/', +        'only_matching': True, +    }, { +        'url': 'http://mundonick.uol.com.br/programas/the-loud-house/videos/muitas-irmas/7ljo9j', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        domain, display_id = re.match(self._VALID_URL, url).groups() +        webpage = self._download_webpage(url, display_id) +        uri = self._search_regex( +            r'data-(?:contenturi|mgid)="([^"]+)', webpage, 'mgid') +        video_id = self._id_from_uri(uri) +        config = self._download_json( +            'http://media.mtvnservices.com/pmt/e1/access/index.html', +            video_id, query={ +                'uri': uri, +                
'configtype': 'edge', +            }, headers={ +                'Referer': url, +            }) +        info_url = self._remove_template_parameter(config['feedWithQueryParams']) +        if info_url == 'None': +            if domain.startswith('www.'): +                domain = domain[4:] +            content_domain = { +                'mundonick.uol': 'mundonick.com.br', +                'nickjr': 'br.nickelodeonjunior.tv', +            }[domain] +            query = { +                'mgid': uri, +                'imageEp': content_domain, +                'arcEp': content_domain, +            } +            if domain == 'nickjr': +                query['ep'] = 'c4b16088' +            info_url = update_url_query( +                'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed', query) +        return self._get_videos_info_from_url(info_url, video_id)  class NickDeIE(MTVServicesInfoExtractor): diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py deleted file mode 100644 index e43b37136..000000000 --- a/youtube_dl/extractor/nowtv.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( -    ExtractorError, -    determine_ext, -    int_or_none, -    parse_iso8601, -    parse_duration, -    remove_start, -) - - -class NowTVBaseIE(InfoExtractor): -    _VIDEO_FIELDS = ( -        'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', -        'broadcastStartDate', 'seoUrl', 'duration', 'files', -        'format.defaultImage169Format', 'format.defaultImage169Logo') - -    def _extract_video(self, info, display_id=None): -        video_id = compat_str(info['id']) - -        files = info['files'] -        if not files: -            if info.get('geoblocked', False): -                raise ExtractorError( -                    'Video %s is not available from your location due 
to geo restriction' % video_id, -                    expected=True) -            if not info.get('free', True): -                raise ExtractorError( -                    'Video %s is not available for free' % video_id, expected=True) - -        formats = [] -        for item in files['items']: -            if determine_ext(item['path']) != 'f4v': -                continue -            app, play_path = remove_start(item['path'], '/').split('/', 1) -            formats.append({ -                'url': 'rtmpe://fms.rtl.de', -                'app': app, -                'play_path': 'mp4:%s' % play_path, -                'ext': 'flv', -                'page_url': 'http://rtlnow.rtl.de', -                'player_url': 'http://cdn.static-fra.de/now/vodplayer.swf', -                'tbr': int_or_none(item.get('bitrate')), -            }) -        self._sort_formats(formats) - -        title = info['title'] -        description = info.get('articleLong') or info.get('articleShort') -        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') -        duration = parse_duration(info.get('duration')) - -        f = info.get('format', {}) -        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') - -        return { -            'id': video_id, -            'display_id': display_id or info.get('seoUrl'), -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'timestamp': timestamp, -            'duration': duration, -            'formats': formats, -        } - - -class NowTVIE(NowTVBaseIE): -    _WORKING = False -    _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P<id>[^/]+)/(?:player|preview)' - -    _TESTS = [{ -        # rtl -        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player', -        'info_dict': { -            'id': 
'203519', -            'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', -            'ext': 'flv', -            'title': 'Inka Bause stellt die neuen Bauern vor', -            'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', -            'thumbnail': r're:^https?://.*\.jpg$', -            'timestamp': 1432580700, -            'upload_date': '20150525', -            'duration': 2786, -        }, -        'params': { -            # rtmp download -            'skip_download': True, -        }, -    }, { -        # rtl2 -        'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player', -        'info_dict': { -            'id': '203481', -            'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', -            'ext': 'flv', -            'title': 'Berlin - Tag & Nacht (Folge 934)', -            'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', -            'thumbnail': r're:^https?://.*\.jpg$', -            'timestamp': 1432666800, -            'upload_date': '20150526', -            'duration': 2641, -        }, -        'params': { -            # rtmp download -            'skip_download': True, -        }, -    }, { -        # rtlnitro -        'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player', -        'info_dict': { -            'id': '165780', -            'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', -            'ext': 'flv', -            'title': 'Hals- und Beinbruch', -            'description': 'md5:b50d248efffe244e6f56737f0911ca57', -            'thumbnail': r're:^https?://.*\.jpg$', -            'timestamp': 1432415400, -            'upload_date': '20150523', -            'duration': 2742, -        }, -        'params': { -            # rtmp download -            'skip_download': True, -        }, -    }, { -        # superrtl -        'url': 
'http://www.nowtv.de/superrtl/medicopter-117/angst/player', -        'info_dict': { -            'id': '99205', -            'display_id': 'medicopter-117/angst', -            'ext': 'flv', -            'title': 'Angst!', -            'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', -            'thumbnail': r're:^https?://.*\.jpg$', -            'timestamp': 1222632900, -            'upload_date': '20080928', -            'duration': 3025, -        }, -        'params': { -            # rtmp download -            'skip_download': True, -        }, -    }, { -        # ntv -        'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player', -        'info_dict': { -            'id': '203521', -            'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', -            'ext': 'flv', -            'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', -            'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', -            'thumbnail': r're:^https?://.*\.jpg$', -            'timestamp': 1432751700, -            'upload_date': '20150527', -            'duration': 1083, -        }, -        'params': { -            # rtmp download -            'skip_download': True, -        }, -    }, { -        # vox -        'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', -        'info_dict': { -            'id': '128953', -            'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', -            'ext': 'flv', -            'title': "Büro-Fall / Chihuahua 'Joel'", -            'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', -            'thumbnail': r're:^https?://.*\.jpg$', -            'timestamp': 1432408200, -            'upload_date': '20150523', -            'duration': 3092, -        }, -        'params': { -            # rtmp download -            'skip_download': True, -        }, -    }, { -        'url': 
'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview', -        'only_matching': True, -    }, { -        'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', -        'only_matching': True, -    }, { -        'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player', -        'only_matching': True, -    }, { -        'url': 'http://www.nowtv.de/rtl2/zuhause-im-glueck/jahr/2015/11/eine-erschuetternde-diagnose/player', -        'only_matching': True, -    }] - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        display_id = '%s/%s' % (mobj.group('show_id'), mobj.group('id')) - -        info = self._download_json( -            'https://api.nowtv.de/v3/movies/%s?fields=%s' -            % (display_id, ','.join(self._VIDEO_FIELDS)), display_id) - -        return self._extract_video(info, display_id) - - -class NowTVListIE(NowTVBaseIE): -    _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/list/(?P<id>[^?/#&]+)$' - -    _SHOW_FIELDS = ('title', ) -    _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) - -    _TESTS = [{ -        'url': 'http://www.nowtv.at/rtl/stern-tv/list/aktuell', -        'info_dict': { -            'id': '17006', -            'title': 'stern TV - Aktuell', -        }, -        'playlist_count': 1, -    }, { -        'url': 'http://www.nowtv.at/rtl/das-supertalent/list/free-staffel-8', -        'info_dict': { -            'id': '20716', -            'title': 'Das Supertalent - FREE Staffel 8', -        }, -        'playlist_count': 14, -    }] - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        show_id = mobj.group('show_id') -        season_id = mobj.group('id') - -        fields = [] -        fields.extend(self._SHOW_FIELDS) -        
fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) -        fields.extend( -            'formatTabs.formatTabPages.container.movies.%s' % field -            for field in self._VIDEO_FIELDS) - -        list_info = self._download_json( -            'https://api.nowtv.de/v3/formats/seo?fields=%s&name=%s.php' -            % (','.join(fields), show_id), -            season_id) - -        season = next( -            season for season in list_info['formatTabs']['items'] -            if season.get('seoheadline') == season_id) - -        title = '%s - %s' % (list_info['title'], season['headline']) - -        entries = [] -        for container in season['formatTabPages']['items']: -            for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []: -                entries.append(self._extract_video(info)) - -        return self.playlist_result( -            entries, compat_str(season.get('id') or season_id), title) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index a99af12a4..d1eb3be25 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -112,6 +112,8 @@ class PhantomJSwrapper(object):          return get_exe_version('phantomjs', version_re=r'([0-9.]+)')      def __init__(self, extractor, required_version=None, timeout=10000): +        self._TMP_FILES = {} +          self.exe = check_executable('phantomjs', ['-v'])          if not self.exe:              raise ExtractorError('PhantomJS executable not found in PATH, ' @@ -130,7 +132,6 @@ class PhantomJSwrapper(object):          self.options = {              'timeout': timeout,          } -        self._TMP_FILES = {}          for name in self._TMP_FILE_NAMES:              tmp = tempfile.NamedTemporaryFile(delete=False)              tmp.close() @@ -140,7 +141,7 @@ class PhantomJSwrapper(object):          for name in self._TMP_FILE_NAMES:              try:                  
os.remove(self._TMP_FILES[name].name) -            except (IOError, OSError): +            except (IOError, OSError, KeyError):                  pass      def _save_cookies(self, url): @@ -242,7 +243,7 @@ class PhantomJSwrapper(object):  class OpenloadIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.tv)/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' +    _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)'      _TESTS = [{          'url': 'https://openload.co/f/kUEfGclsU9o', @@ -289,6 +290,9 @@ class OpenloadIE(InfoExtractor):      }, {          'url': 'http://www.openload.link/f/KnG-kKZdcfY',          'only_matching': True, +    }, { +        'url': 'https://oload.stream/f/KnG-kKZdcfY', +        'only_matching': True,      }]      _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 4bf0aa786..597b11218 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -131,6 +131,13 @@ class PluralsightIE(PluralsightBaseIE):              if BLOCKED in response:                  raise ExtractorError(                      'Unable to login: %s' % BLOCKED, expected=True) +            MUST_AGREE = 'To continue using Pluralsight, you must agree to' +            if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): +                raise ExtractorError( +                    'Unable to login: %s some documents. Go to pluralsight.com, ' +                    'log in and agree with what Pluralsight requires.' 
+                    % MUST_AGREE, expected=True) +              raise ExtractorError('Unable to log in')      def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py index 8218c7d3b..60ade06da 100644 --- a/youtube_dl/extractor/porncom.py +++ b/youtube_dl/extractor/porncom.py @@ -77,12 +77,14 @@ class PornComIE(InfoExtractor):          self._sort_formats(formats)          view_count = str_to_int(self._search_regex( -            r'class=["\']views["\'][^>]*><p>([\d,.]+)', webpage, +            (r'Views:\s*</span>\s*<span>\s*([\d,.]+)', +             r'class=["\']views["\'][^>]*><p>([\d,.]+)'), webpage,              'view count', fatal=False))          def extract_list(kind):              s = self._search_regex( -                r'(?s)<p[^>]*>%s:(.+?)</p>' % kind.capitalize(), +                (r'(?s)%s:\s*</span>\s*<span>(.+?)</span>' % kind.capitalize(), +                 r'(?s)<p[^>]*>%s:(.+?)</p>' % kind.capitalize()),                  webpage, kind, fatal=False)              return re.findall(r'<a[^>]+>([^<]+)</a>', s or '') diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 5bf64a56b..d22311031 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -17,6 +17,7 @@ from ..utils import (      parse_duration,      strip_or_none,      try_get, +    unescapeHTML,      unified_strdate,      unified_timestamp,      update_url_query, @@ -249,6 +250,41 @@ class RaiPlayLiveIE(RaiBaseIE):          } +class RaiPlayPlaylistIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' +    _TESTS = [{ +        'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', +        'info_dict': { +            'id': 'nondirloalmiocapo', +            'title': 'Non dirlo al mio capo', +            'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', +        }, +        'playlist_mincount': 12, +    
}] + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        title = self._html_search_meta( +            ('programma', 'nomeProgramma'), webpage, 'title') +        description = unescapeHTML(self._html_search_meta( +            ('description', 'og:description'), webpage, 'description')) + + +        entries = [] +        for mobj in re.finditer( +                r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', +                webpage): +            video_url = urljoin(url, mobj.group('path')) +            entries.append(self.url_result( +                video_url, ie=RaiPlayIE.ie_key(), +                video_id=RaiPlayIE._match_id(video_url))) + +        return self.playlist_result(entries, playlist_id, title, description) + +  class RaiIE(RaiBaseIE):      _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE      _TESTS = [{ diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py index b446a02ba..4023aeef8 100644 --- a/youtube_dl/extractor/scrippsnetworks.py +++ b/youtube_dl/extractor/scrippsnetworks.py @@ -1,13 +1,11 @@  # coding: utf-8  from __future__ import unicode_literals -import datetime  import json  import hashlib -import hmac  import re -from .common import InfoExtractor +from .aws import AWSIE  from .anvato import AnvatoIE  from ..utils import (      smuggle_url, @@ -16,7 +14,7 @@ from ..utils import (  ) -class ScrippsNetworksWatchIE(InfoExtractor): +class ScrippsNetworksWatchIE(AWSIE):      IE_NAME = 'scrippsnetworks:watch'      _VALID_URL = r'''(?x)                      https?:// @@ -64,44 +62,27 @@ class ScrippsNetworksWatchIE(InfoExtractor):          'travelchannel': 'trav',          'geniuskitchen': 'genius',      } -    _SNI_HOST = 'web.api.video.snidigital.com' -    _AWS_REGION = 'us-east-1' -    
_AWS_IDENTITY_ID_JSON = json.dumps({ -        'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % _AWS_REGION -    }) -    _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback'      _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1' -    _AWS_SERVICE = 'execute-api' -    _AWS_REQUEST = 'aws4_request' -    _AWS_SIGNED_HEADERS = ';'.join([ -        'host', 'x-amz-date', 'x-amz-security-token', 'x-api-key']) -    _AWS_CANONICAL_REQUEST_TEMPLATE = '''GET -%(uri)s - -host:%(host)s -x-amz-date:%(date)s -x-amz-security-token:%(token)s -x-api-key:%(key)s +    _AWS_PROXY_HOST = 'web.api.video.snidigital.com' -%(signed_headers)s -%(payload_hash)s''' +    _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback'      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          site_id, video_id = mobj.group('site', 'id') -        def aws_hash(s): -            return hashlib.sha256(s.encode('utf-8')).hexdigest() - +        aws_identity_id_json = json.dumps({ +            'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION +        }).encode('utf-8')          token = self._download_json( -            'https://cognito-identity.us-east-1.amazonaws.com/', video_id, -            data=self._AWS_IDENTITY_ID_JSON.encode('utf-8'), +            'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id, +            data=aws_identity_id_json,              headers={                  'Accept': '*/*',                  'Content-Type': 'application/x-amz-json-1.1',                  'Referer': url, -                'X-Amz-Content-Sha256': aws_hash(self._AWS_IDENTITY_ID_JSON), +                'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(),                  'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken',                  'X-Amz-User-Agent': self._AWS_USER_AGENT,              })['Token'] @@ -124,64 +105,12 @@ x-api-key:%(key)s                  sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % 
key,                  fatal=True) -        access_key_id = get('AccessKeyId') -        secret_access_key = get('SecretAccessKey') -        session_token = get('SessionToken') - -        # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html -        uri = '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id) -        datetime_now = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ') -        date = datetime_now[:8] -        canonical_string = self._AWS_CANONICAL_REQUEST_TEMPLATE % { -            'uri': uri, -            'host': self._SNI_HOST, -            'date': datetime_now, -            'token': session_token, -            'key': self._AWS_API_KEY, -            'signed_headers': self._AWS_SIGNED_HEADERS, -            'payload_hash': aws_hash(''), -        } - -        # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html -        credential_string = '/'.join([date, self._AWS_REGION, self._AWS_SERVICE, self._AWS_REQUEST]) -        string_to_sign = '\n'.join([ -            'AWS4-HMAC-SHA256', datetime_now, credential_string, -            aws_hash(canonical_string)]) - -        # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html -        def aws_hmac(key, msg): -            return hmac.new(key, msg.encode('utf-8'), hashlib.sha256) - -        def aws_hmac_digest(key, msg): -            return aws_hmac(key, msg).digest() - -        def aws_hmac_hexdigest(key, msg): -            return aws_hmac(key, msg).hexdigest() - -        k_secret = 'AWS4' + secret_access_key -        k_date = aws_hmac_digest(k_secret.encode('utf-8'), date) -        k_region = aws_hmac_digest(k_date, self._AWS_REGION) -        k_service = aws_hmac_digest(k_region, self._AWS_SERVICE) -        k_signing = aws_hmac_digest(k_service, self._AWS_REQUEST) - -        signature = aws_hmac_hexdigest(k_signing, string_to_sign) - -        auth_header = ', '.join([ -            
'AWS4-HMAC-SHA256 Credential=%s' % '/'.join( -                [access_key_id, date, self._AWS_REGION, self._AWS_SERVICE, self._AWS_REQUEST]), -            'SignedHeaders=%s' % self._AWS_SIGNED_HEADERS, -            'Signature=%s' % signature, -        ]) - -        mcp_id = self._download_json( -            'https://%s%s' % (self._SNI_HOST, uri), video_id, headers={ -                'Accept': '*/*', -                'Referer': url, -                'Authorization': auth_header, -                'X-Amz-Date': datetime_now, -                'X-Amz-Security-Token': session_token, -                'X-Api-Key': self._AWS_API_KEY, -            })['results'][0]['mcpId'] +        mcp_id = self._aws_execute_api({ +            'uri': '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id), +            'access_key': get('AccessKeyId'), +            'secret_key': get('SecretAccessKey'), +            'session_token': get('SessionToken'), +        }, video_id)['results'][0]['mcpId']          return self.url_result(              smuggle_url( diff --git a/youtube_dl/extractor/sevenplus.py b/youtube_dl/extractor/sevenplus.py new file mode 100644 index 000000000..9792f820a --- /dev/null +++ b/youtube_dl/extractor/sevenplus.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .brightcove import BrightcoveNewIE +from ..utils import update_url_query + + +class SevenPlusIE(BrightcoveNewIE): +    IE_NAME = '7plus' +    _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P<path>[^?]+\?.*?\bepisode-id=(?P<id>[^&#]+))' +    _TESTS = [{ +        'url': 'https://7plus.com.au/BEAT?episode-id=BEAT-001', +        'info_dict': { +            'id': 'BEAT-001', +            'ext': 'mp4', +            'title': 'S1 E1 - Help / Lucy In The Sky With Diamonds', +            'description': 'md5:37718bea20a8eedaca7f7361af566131', +            'uploader_id': '5303576322001', +            'upload_date': '20171031', +            'timestamp': 
1509440068, +        }, +        'params': { +            'format': 'bestvideo', +            'skip_download': True, +        } +    }, { +        'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        path, episode_id = re.match(self._VALID_URL, url).groups() + +        media = self._download_json( +            'https://videoservice.swm.digital/playback', episode_id, query={ +                'appId': '7plus', +                'deviceType': 'web', +                'platformType': 'web', +                'accountId': 5303576322001, +                'referenceId': 'ref:' + episode_id, +                'deliveryId': 'csai', +                'videoType': 'vod', +            })['media'] + +        for source in media.get('sources', {}): +            src = source.get('src') +            if not src: +                continue +            source['src'] = update_url_query(src, {'rule': ''}) + +        info = self._parse_brightcove_metadata(media, episode_id) + +        content = self._download_json( +            'https://component-cdn.swm.digital/content/' + path, +            episode_id, headers={ +                'market-id': 4, +            }, fatal=False) or {} +        for item in content.get('items', {}): +            if item.get('componentData', {}).get('componentType') == 'infoPanel': +                for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]: +                    value = item.get(src_key) +                    if value: +                        info[dst_key] = value + +        return info diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 374f7faf9..5c2a6206b 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -1,22 +1,53 @@  # coding: utf-8  from __future__ import unicode_literals -import re  import json +import math +import re -from .common import InfoExtractor +from .aws 
import AWSIE  from ..compat import compat_HTTPError  from ..utils import ( +    clean_html,      ExtractorError, +    InAdvancePagedList,      int_or_none,      parse_iso8601,      str_or_none,      urlencode_postdata, -    clean_html,  ) -class ShahidIE(InfoExtractor): +class ShahidBaseIE(AWSIE): +    _AWS_PROXY_HOST = 'api2.shahid.net' +    _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' + +    def _handle_error(self, e): +        fail_data = self._parse_json( +            e.cause.read().decode('utf-8'), None, fatal=False) +        if fail_data: +            faults = fail_data.get('faults', []) +            faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) +            if faults_message: +                raise ExtractorError(faults_message, expected=True) + +    def _call_api(self, path, video_id, request=None): +        query = {} +        if request: +            query['request'] = json.dumps(request) +        try: +            return self._aws_execute_api({ +                'uri': '/proxy/v2/' + path, +                'access_key': 'AKIAI6X4TYCIXM2B7MUQ', +                'secret_key': '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', +            }, video_id, query) +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError): +                self._handle_error(e) +            raise + + +class ShahidIE(ShahidBaseIE):      _NETRC_MACHINE = 'shahid'      _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'      _TESTS = [{ @@ -41,34 +72,25 @@ class ShahidIE(InfoExtractor):          'only_matching': True      }] -    def _api2_request(self, *args, **kwargs): -        try: -            return self._download_json(*args, **kwargs) -        except ExtractorError as e: -            if isinstance(e.cause, compat_HTTPError): -                fail_data = self._parse_json( -                    
e.cause.read().decode('utf-8'), None, fatal=False) -                if fail_data: -                    faults = fail_data.get('faults', []) -                    faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) -                    if faults_message: -                        raise ExtractorError(faults_message, expected=True) -            raise -      def _real_initialize(self):          email, password = self._get_login_info()          if email is None:              return -        user_data = self._api2_request( -            'https://shahid.mbc.net/wd/service/users/login', -            None, 'Logging in', data=json.dumps({ -                'email': email, -                'password': password, -                'basic': 'false', -            }).encode('utf-8'), headers={ -                'Content-Type': 'application/json; charset=UTF-8', -            })['user'] +        try: +            user_data = self._download_json( +                'https://shahid.mbc.net/wd/service/users/login', +                None, 'Logging in', data=json.dumps({ +                    'email': email, +                    'password': password, +                    'basic': 'false', +                }).encode('utf-8'), headers={ +                    'Content-Type': 'application/json; charset=UTF-8', +                })['user'] +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError): +                self._handle_error(e) +            raise          self._download_webpage(              'https://shahid.mbc.net/populateContext', @@ -81,25 +103,13 @@ class ShahidIE(InfoExtractor):                  'sessionId': user_data['sessionId'],              })) -    def _get_api_data(self, response): -        data = response.get('data', {}) - -        error = data.get('error') -        if error: -            raise ExtractorError( -                '%s returned error: %s' % (self.IE_NAME, 
'\n'.join(error.values())), -                expected=True) - -        return data -      def _real_extract(self, url):          page_type, video_id = re.match(self._VALID_URL, url).groups()          if page_type == 'clip':              page_type = 'episode' -        playout = self._api2_request( -            'https://api2.shahid.net/proxy/v2/playout/url/' + video_id, -            video_id, 'Downloading player JSON')['playout'] +        playout = self._call_api( +            'playout/url/' + video_id, video_id)['playout']          if playout.get('drm'):              raise ExtractorError('This video is DRM protected.', expected=True) @@ -107,13 +117,27 @@ class ShahidIE(InfoExtractor):          formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4')          self._sort_formats(formats) -        video = self._get_api_data(self._download_json( +        # video = self._call_api( +        #     'product/id', video_id, { +        #         'id': video_id, +        #         'productType': 'ASSET', +        #         'productSubType': page_type.upper() +        #     })['productModel'] + +        response = self._download_json(              'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id),              video_id, 'Downloading video JSON', query={                  'apiKey': 'sh@hid0nlin3',                  'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', -            }))[page_type] +            }) +        data = response.get('data', {}) +        error = data.get('error') +        if error: +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), +                expected=True) +        video = data[page_type]          title = video['title']          categories = [              category['name'] @@ -135,3 +159,57 @@ class ShahidIE(InfoExtractor):              'episode_id': video_id,              'formats': formats,          } + + +class ShahidShowIE(ShahidBaseIE): +    
_VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)' +    _TESTS = [{ +        'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187', +        'info_dict': { +            'id': '79187', +            'title': 'رامز قرش البحر', +            'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff', +        }, +        'playlist_mincount': 32, +    }, { +        'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861', +        'only_matching': True +    }] +    _PAGE_SIZE = 30 + +    def _real_extract(self, url): +        show_id = self._match_id(url) + +        product = self._call_api( +            'playableAsset', show_id, {'showId': show_id})['productModel'] +        playlist = product['playlist'] +        playlist_id = playlist['id'] +        show = product.get('show', {}) + +        def page_func(page_num): +            playlist = self._call_api( +                'product/playlist', show_id, { +                    'playListId': playlist_id, +                    'pageNumber': page_num, +                    'pageSize': self._PAGE_SIZE, +                    'sorts': [{ +                        'order': 'DESC', +                        'type': 'SORTDATE' +                    }], +                }) +            for product in playlist.get('productList', {}).get('products', []): +                product_url = product.get('productUrl', {}).get('url') +                if not product_url: +                    continue +                yield self.url_result( +                    product_url, 'Shahid', +                    str_or_none(product.get('id')), +                    product.get('title')) + +        entries = InAdvancePagedList( +            page_func, +            math.ceil(playlist['count'] / self._PAGE_SIZE), +            self._PAGE_SIZE) + +        return self.playlist_result( +            entries, show_id, show.get('title'), 
show.get('description')) diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py index accd112aa..c3078e285 100644 --- a/youtube_dl/extractor/sonyliv.py +++ b/youtube_dl/extractor/sonyliv.py @@ -2,6 +2,7 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..utils import smuggle_url  class SonyLIVIE(InfoExtractor): @@ -10,12 +11,12 @@ class SonyLIVIE(InfoExtractor):          'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight",          'info_dict': {              'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight", -            'id': '5024612095001', +            'id': 'ref:5024612095001',              'ext': 'mp4', -            'upload_date': '20160707', +            'upload_date': '20170923',              'description': 'md5:7f28509a148d5be9d0782b4d5106410d', -            'uploader_id': '4338955589001', -            'timestamp': 1467870968, +            'uploader_id': '5182475815001', +            'timestamp': 1506200547,          },          'params': {              'skip_download': True, @@ -26,9 +27,11 @@ class SonyLIVIE(InfoExtractor):          'only_matching': True,      }] -    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' +    # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' +    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s'      def _real_extract(self, url):          brightcove_id = self._match_id(url)          return self.url_result( -            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) +            smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['IN']}), +            'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/stretchinternet.py 
b/youtube_dl/extractor/stretchinternet.py new file mode 100644 index 000000000..ae2ac1b42 --- /dev/null +++ b/youtube_dl/extractor/stretchinternet.py @@ -0,0 +1,48 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class StretchInternetIE(InfoExtractor): +    _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/portal\.htm\?.*?\beventId=(?P<id>\d+)' +    _TEST = { +        'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=313900&streamType=video', +        'info_dict': { +            'id': '313900', +            'ext': 'mp4', +            'title': 'Augustana (S.D.) Baseball vs University of Mary', +            'description': 'md5:7578478614aae3bdd4a90f578f787438', +            'timestamp': 1490468400, +            'upload_date': '20170325', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        stream = self._download_json( +            'https://neo-client.stretchinternet.com/streamservice/v1/media/stream/v%s' +            % video_id, video_id) + +        video_url = 'https://%s' % stream['source'] + +        event = self._download_json( +            'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', +            video_id, query={ +                'clientID': 99997, +                'eventID': video_id, +                'token': 'asdf', +            })['event'] + +        title = event.get('title') or event['mobileTitle'] +        description = event.get('customText') +        timestamp = int_or_none(event.get('longtime')) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'timestamp': timestamp, +            'url': video_url, +        } diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index e9474533f..eab22c38f 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -4,58 +4,109 @@ from 
__future__ import unicode_literals  import re  from .turner import TurnerBaseIE -from ..utils import extract_attributes +from ..utils import ( +    float_or_none, +    int_or_none, +    strip_or_none, +)  class TBSIE(TurnerBaseIE): -    # https://github.com/rg3/youtube-dl/issues/13658 -    _WORKING = False - -    _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html' +    _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+)'      _TESTS = [{ -        'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html', -        'md5': '9e61d680e2285066ade7199e6408b2ee', +        'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster',          'info_dict': { -            'id': '2007318', +            'id': '8d384cde33b89f3a43ce5329de42903ed5099887',              'ext': 'mp4', -            'title': 'Theatrical Trailer', -            'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.', +            'title': 'Monster', +            'description': 'Get a first look at the theatrical trailer for TNT’s highly anticipated new psychological thriller The Alienist, which premieres January 22 on TNT.', +            'timestamp': 1508175329, +            'upload_date': '20171016',          }, -        'skip': 'TBS videos are deleted after a while', +        'params': { +            # m3u8 download +            'skip_download': True, +        }      }, { -        'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html', -        'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56', -        'info_dict': { -            'id': '1538823', -            'ext': 'mp4', -            'title': 'You Better Run', -            'description': 'Letty Raines must figure out what she\'s running toward while running away from her 
past. Good Behavior premieres November 15 at 9/8c.', -        }, -        'skip': 'TBS videos are deleted after a while', +        'url': 'http://www.tbs.com/shows/search-party/season-1/episode-1/explicit-the-mysterious-disappearance-of-the-girl-no-one-knew', +        'only_matching': True, +    }, { +        'url': 'http://www.tntdrama.com/movies/star-wars-a-new-hope', +        'only_matching': True,      }]      def _real_extract(self, url): -        domain, display_id = re.match(self._VALID_URL, url).groups() -        site = domain[:3] +        site, display_id = re.match(self._VALID_URL, url).groups()          webpage = self._download_webpage(url, display_id) -        video_params = extract_attributes(self._search_regex(r'(<[^>]+id="page-video"[^>]*>)', webpage, 'video params')) -        query = None -        clip_id = video_params.get('clipid') -        if clip_id: -            query = 'id=' + clip_id -        else: -            query = 'titleId=' + video_params['titleid'] -        return self._extract_cvp_info( -            'http://www.%s.com/service/cvpXml?%s' % (domain, query), display_id, { -                'default': { -                    'media_src': 'http://ht.cdn.turner.com/%s/big' % site, -                }, -                'secure': { -                    'media_src': 'http://androidhls-secure.cdn.turner.com/%s/big' % site, -                    'tokenizer_src': 'http://www.%s.com/video/processors/services/token_ipadAdobe.do' % domain, -                }, -            }, { -                'url': url, -                'site_name': site.upper(), -                'auth_required': video_params.get('isAuthRequired') != 'false', -            }) +        video_data = self._parse_json(self._search_regex( +            r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', +            webpage, 'drupal setting'), display_id)['turner_playlist'][0] + +        media_id = video_data['mediaID'] +        title = video_data['title'] + + 
       streams_data = self._download_json( +            'http://medium.ngtv.io/media/%s/tv' % media_id, +            media_id)['media']['tv'] +        duration = None +        chapters = [] +        formats = [] +        for supported_type in ('unprotected', 'bulkaes'): +            stream_data = streams_data.get(supported_type, {}) +            m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') +            if not m3u8_url: +                continue +            if stream_data.get('playlistProtection') == 'spe': +                m3u8_url = self._add_akamai_spe_token( +                    'http://www.%s.com/service/token_spe' % site, +                    m3u8_url, media_id, { +                        'url': url, +                        'site_name': site[:3].upper(), +                        'auth_required': video_data.get('authRequired') == '1', +                    }) +            formats.extend(self._extract_m3u8_formats( +                m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + +            duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration')) + +            if not chapters: +                for chapter in stream_data.get('contentSegments', []): +                    start_time = float_or_none(chapter.get('start')) +                    chapter_duration = float_or_none(chapter.get('duration')) +                    if start_time is None or chapter_duration is None: +                        continue +                    chapters.append({ +                        'start_time': start_time, +                        'end_time': start_time + chapter_duration, +                    }) +        self._sort_formats(formats) + +        thumbnails = [] +        for image_id, image in video_data.get('images', {}).items(): +            image_url = image.get('url') +            if not image_url or image.get('type') != 'video': +                continue +            i = { +                'id': image_id, +                'url': image_url, 
+            } +            mobj = re.search(r'(\d+)x(\d+)', image_url) +            if mobj: +                i.update({ +                    'width': int(mobj.group(1)), +                    'height': int(mobj.group(2)), +                }) +            thumbnails.append(i) + +        return { +            'id': media_id, +            'title': title, +            'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), +            'duration': duration, +            'timestamp': int_or_none(video_data.get('created')), +            'season_number': int_or_none(video_data.get('season')), +            'episode_number': int_or_none(video_data.get('episode')), +            'chapters': chapters, +            'thumbnails': thumbnails, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 17c0adc15..2e7876cc5 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -16,7 +16,7 @@ from ..utils import (  class TouTvIE(InfoExtractor):      _NETRC_MACHINE = 'toutv'      IE_NAME = 'tou.tv' -    _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/S[0-9]+E[0-9]+)?)' +    _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)'      _access_token = None      _claims = None @@ -37,6 +37,9 @@ class TouTvIE(InfoExtractor):      }, {          'url': 'http://ici.tou.tv/hackers',          'only_matching': True, +    }, { +        'url': 'https://ici.tou.tv/l-age-adulte/S01C501', +        'only_matching': True,      }]      def _real_initialize(self): diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index efeb677ee..e73b64aeb 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -18,9 +18,32 @@ from ..utils import (  class TurnerBaseIE(AdobePassIE): +    _AKAMAI_SPE_TOKEN_CACHE = {} +      def _extract_timestamp(self, video_data):          
return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) +    def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data): +        secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' +        token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) +        if not token: +            query = { +                'path': secure_path, +                'videoId': content_id, +            } +            if ap_data.get('auth_required'): +                query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) +            auth = self._download_xml( +                tokenizer_src, content_id, query=query) +            error_msg = xpath_text(auth, 'error/msg') +            if error_msg: +                raise ExtractorError(error_msg, expected=True) +            token = xpath_text(auth, 'token') +            if not token: +                return video_url +            self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token +        return video_url + '?hdnea=' + token +      def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):          video_data = self._download_xml(data_src, video_id)          video_id = video_data.attrib['id'] @@ -33,7 +56,6 @@ class TurnerBaseIE(AdobePassIE):          #         rtmp_src = splited_rtmp_src[1]          # aifp = xpath_text(video_data, 'akamai/aifp', default='') -        tokens = {}          urls = []          formats = []          rex = re.compile( @@ -67,26 +89,10 @@ class TurnerBaseIE(AdobePassIE):                  secure_path_data = path_data.get('secure')                  if not secure_path_data:                      continue -                video_url = secure_path_data['media_src'] + video_url -                secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' -                token = tokens.get(secure_path) -                if not token: -                    
query = { -                        'path': secure_path, -                        'videoId': content_id, -                    } -                    if ap_data.get('auth_required'): -                        query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], video_id, ap_data['site_name'], ap_data['site_name']) -                    auth = self._download_xml( -                        secure_path_data['tokenizer_src'], video_id, query=query) -                    error_msg = xpath_text(auth, 'error/msg') -                    if error_msg: -                        raise ExtractorError(error_msg, expected=True) -                    token = xpath_text(auth, 'token') -                    if not token: -                        continue -                    tokens[secure_path] = token -                video_url = video_url + '?hdnea=' + token +                video_url = self._add_akamai_spe_token( +                    secure_path_data['tokenizer_src'], +                    secure_path_data['media_src'] + video_url, +                    content_id, ap_data)              elif not re.match('https?://', video_url):                  base_path_data = path_data.get(ext, path_data.get('default', {}))                  media_src = base_path_data.get('media_src') diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py new file mode 100644 index 000000000..e2169f2bc --- /dev/null +++ b/youtube_dl/extractor/tvnow.py @@ -0,0 +1,175 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    ExtractorError, +    parse_iso8601, +    parse_duration, +    update_url_query, +) + + +class TVNowBaseIE(InfoExtractor): +    _VIDEO_FIELDS = ( +        'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', +        'broadcastStartDate', 'isDrm', 'duration', 'manifest.dashclear', +        'format.defaultImage169Format', 
'format.defaultImage169Logo') + +    def _call_api(self, path, video_id, query): +        return self._download_json( +            'https://api.tvnow.de/v3/' + path, +            video_id, query=query) + +    def _extract_video(self, info, display_id): +        video_id = compat_str(info['id']) +        title = info['title'] + +        mpd_url = info['manifest']['dashclear'] +        if not mpd_url: +            if info.get('isDrm'): +                raise ExtractorError( +                    'Video %s is DRM protected' % video_id, expected=True) +            if info.get('geoblocked'): +                raise ExtractorError( +                    'Video %s is not available from your location due to geo restriction' % video_id, +                    expected=True) +            if not info.get('free', True): +                raise ExtractorError( +                    'Video %s is not available for free' % video_id, expected=True) + +        mpd_url = update_url_query(mpd_url, {'filter': ''}) +        formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False) +        formats.extend(self._extract_ism_formats( +            mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'), +            video_id, ism_id='mss', fatal=False)) +        formats.extend(self._extract_m3u8_formats( +            mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'), +            video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) +        self._sort_formats(formats) + +        description = info.get('articleLong') or info.get('articleShort') +        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') +        duration = parse_duration(info.get('duration')) + +        f = info.get('format', {}) +        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +   
         'thumbnail': thumbnail, +            'timestamp': timestamp, +            'duration': duration, +            'formats': formats, +        } + + +class TVNowIE(TVNowBaseIE): +    _VALID_URL = r'https?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P<id>[^/]+)/(?:player|preview)' + +    _TESTS = [{ +        # rtl +        'url': 'https://www.tvnow.de/rtl/alarm-fuer-cobra-11/freier-fall/player?return=/rtl', +        'info_dict': { +            'id': '385314', +            'display_id': 'alarm-fuer-cobra-11/freier-fall', +            'ext': 'mp4', +            'title': 'Freier Fall', +            'description': 'md5:8c2d8f727261adf7e0dc18366124ca02', +            'thumbnail': r're:^https?://.*\.jpg$', +            'timestamp': 1512677700, +            'upload_date': '20171207', +            'duration': 2862.0, +        }, +    }, { +        # rtl2 +        'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player', +        'only_matching': True, +    }, { +        # rtlnitro +        'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player', +        'only_matching': True, +    }, { +        # superrtl +        'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player', +        'only_matching': True, +    }, { +        # ntv +        'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player', +        'only_matching': True, +    }, { +        # vox +        'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player', +        'only_matching': True, +    }, { +        # rtlplus +        'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = '%s/%s' % re.match(self._VALID_URL, 
url).groups() + +        info = self._call_api( +            'movies/' + display_id, display_id, query={ +                'fields': ','.join(self._VIDEO_FIELDS), +            }) + +        return self._extract_video(info, display_id) + + +class TVNowListIE(TVNowBaseIE): +    _VALID_URL = r'(?P<base_url>https?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/)list/(?P<id>[^?/#&]+)$' + +    _SHOW_FIELDS = ('title', ) +    _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) +    _VIDEO_FIELDS = ('id', 'headline', 'seoUrl', ) + +    _TESTS = [{ +        'url': 'https://www.tvnow.de/rtl/30-minuten-deutschland/list/aktuell', +        'info_dict': { +            'id': '28296', +            'title': '30 Minuten Deutschland - Aktuell', +        }, +        'playlist_mincount': 1, +    }] + +    def _real_extract(self, url): +        base_url, show_id, season_id = re.match(self._VALID_URL, url).groups() + +        fields = [] +        fields.extend(self._SHOW_FIELDS) +        fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) +        fields.extend( +            'formatTabs.formatTabPages.container.movies.%s' % field +            for field in self._VIDEO_FIELDS) + +        list_info = self._call_api( +            'formats/seo', season_id, query={ +                'fields': ','.join(fields), +                'name': show_id + '.php' +            }) + +        season = next( +            season for season in list_info['formatTabs']['items'] +            if season.get('seoheadline') == season_id) + +        title = '%s - %s' % (list_info['title'], season['headline']) + +        entries = [] +        for container in season['formatTabPages']['items']: +            for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []: +                seo_url = info.get('seoUrl') +                if not seo_url: +                    continue +                entries.append(self.url_result( +       
             base_url + seo_url + '/player', 'TVNow', info.get('id'))) + +        return self.playlist_result( +            entries, compat_str(season.get('id') or season_id), title) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1b0b96371..d7e425041 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -43,7 +43,7 @@ class TwitterBaseIE(InfoExtractor):  class TwitterCardIE(TwitterBaseIE):      IE_NAME = 'twitter:card' -    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?P<path>cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)'      _TESTS = [          {              'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -51,11 +51,10 @@ class TwitterCardIE(TwitterBaseIE):              'info_dict': {                  'id': '560070183650213889',                  'ext': 'mp4', -                'title': 'Twitter Card', +                'title': 'Twitter web player',                  'thumbnail': r're:^https?://.*\.jpg$',                  'duration': 30.033,              }, -            'skip': 'Video gone',          },          {              'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', @@ -63,11 +62,9 @@ class TwitterCardIE(TwitterBaseIE):              'info_dict': {                  'id': '623160978427936768',                  'ext': 'mp4', -                'title': 'Twitter Card', -                'thumbnail': r're:^https?://.*\.jpg', -                'duration': 80.155, +                'title': 'Twitter web player', +                'thumbnail': r're:^https?://.*(?:\bformat=|\.)jpg',              }, -            'skip': 'Video gone',          },          {              'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', @@ -120,15 +117,15 @@ class TwitterCardIE(TwitterBaseIE):              elif media_url.endswith('.mpd'):                  
formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))              else: -                vbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000) +                tbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000)                  a_format = {                      'url': media_url, -                    'format_id': 'http-%d' % vbr if vbr else 'http', -                    'vbr': vbr, +                    'format_id': 'http-%d' % tbr if tbr else 'http', +                    'tbr': tbr,                  }                  # Reported bitRate may be zero -                if not a_format['vbr']: -                    del a_format['vbr'] +                if not a_format['tbr']: +                    del a_format['tbr']                  self._search_dimensions_in_video_url(a_format, media_url) @@ -150,79 +147,83 @@ class TwitterCardIE(TwitterBaseIE):          bearer_token = self._search_regex(              r'BEARER_TOKEN\s*:\s*"([^"]+)"',              main_script, 'bearer token') -        guest_token = self._search_regex( -            r'document\.cookie\s*=\s*decodeURIComponent\("gt=(\d+)', -            webpage, 'guest token') +        # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id          api_data = self._download_json( -            'https://api.twitter.com/2/timeline/conversation/%s.json' % video_id, -            video_id, 'Downloading mobile API data', +            'https://api.twitter.com/1.1/statuses/show/%s.json' % video_id, +            video_id, 'Downloading API data',              headers={                  'Authorization': 'Bearer ' + bearer_token, -                'x-guest-token': guest_token,              }) -        media_info = try_get(api_data, lambda o: o['globalObjects']['tweets'][video_id] -                                                  ['extended_entities']['media'][0]['video_info']) or {} +        media_info = try_get(api_data, 
lambda o: o['extended_entities']['media'][0]['video_info']) or {}          return self._parse_media_info(media_info, video_id)      def _real_extract(self, url): -        video_id = self._match_id(url) +        path, video_id = re.search(self._VALID_URL, url).groups()          config = None          formats = []          duration = None -        webpage = self._download_webpage(url, video_id) +        urls = [url] +        if path.startswith('cards/'): +            urls.append('https://twitter.com/i/videos/' + video_id) -        iframe_url = self._html_search_regex( -            r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', -            webpage, 'video iframe', default=None) -        if iframe_url: -            return self.url_result(iframe_url) +        for u in urls: +            webpage = self._download_webpage(u, video_id) -        config = self._parse_json(self._html_search_regex( -            r'data-(?:player-)?config="([^"]+)"', webpage, -            'data player config', default='{}'), -            video_id) +            iframe_url = self._html_search_regex( +                r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', +                webpage, 'video iframe', default=None) +            if iframe_url: +                return self.url_result(iframe_url) -        if config.get('source_type') == 'vine': -            return self.url_result(config['player_url'], 'Vine') +            config = self._parse_json(self._html_search_regex( +                r'data-(?:player-)?config="([^"]+)"', webpage, +                'data player config', default='{}'), +                video_id) -        periscope_url = PeriscopeIE._extract_url(webpage) -        if periscope_url: -            return self.url_result(periscope_url, PeriscopeIE.ie_key()) +            if config.get('source_type') == 'vine': +                return self.url_result(config['player_url'], 'Vine') 
-        video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') +            periscope_url = PeriscopeIE._extract_url(webpage) +            if periscope_url: +                return self.url_result(periscope_url, PeriscopeIE.ie_key()) -        if video_url: -            if determine_ext(video_url) == 'm3u8': -                formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) -            else: -                f = { -                    'url': video_url, -                } +            video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') + +            if video_url: +                if determine_ext(video_url) == 'm3u8': +                    formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) +                else: +                    f = { +                        'url': video_url, +                    } + +                    self._search_dimensions_in_video_url(f, video_url) -                self._search_dimensions_in_video_url(f, video_url) +                    formats.append(f) -                formats.append(f) +            vmap_url = config.get('vmapUrl') or config.get('vmap_url') +            if vmap_url: +                formats.extend( +                    self._extract_formats_from_vmap_url(vmap_url, video_id)) -        vmap_url = config.get('vmapUrl') or config.get('vmap_url') -        if vmap_url: -            formats.extend( -                self._extract_formats_from_vmap_url(vmap_url, video_id)) +            media_info = None -        media_info = None +            for entity in config.get('status', {}).get('entities', []): +                if 'mediaInfo' in entity: +                    media_info = entity['mediaInfo'] -        for entity in config.get('status', {}).get('entities', []): -            if 'mediaInfo' in entity: -                media_info = entity['mediaInfo'] +            if media_info: +          
      formats.extend(self._parse_media_info(media_info, video_id)) +                duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) -        if media_info: -            formats.extend(self._parse_media_info(media_info, video_id)) -            duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) +            username = config.get('user', {}).get('screen_name') +            if username: +                formats.extend(self._extract_mobile_formats(username, video_id)) -        username = config.get('user', {}).get('screen_name') -        if username: -            formats.extend(self._extract_mobile_formats(username, video_id)) +            if formats: +                break          self._remove_duplicate_formats(formats)          self._sort_formats(formats) @@ -258,9 +259,6 @@ class TwitterIE(InfoExtractor):              'uploader_id': 'freethenipple',              'duration': 12.922,          }, -        'params': { -            'skip_download': True,  # requires ffmpeg -        },      }, {          'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',          'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', @@ -277,7 +275,6 @@ class TwitterIE(InfoExtractor):          'skip': 'Account suspended',      }, {          'url': 'https://twitter.com/starwars/status/665052190608723968', -        'md5': '39b7199856dee6cd4432e72c74bc69d4',          'info_dict': {              'id': '665052190608723968',              'ext': 'mp4', @@ -303,20 +300,16 @@ class TwitterIE(InfoExtractor):          },      }, {          'url': 'https://twitter.com/jaydingeer/status/700207533655363584', -        'md5': '',          'info_dict': {              'id': '700207533655363584',              'ext': 'mp4', -            'title': 'あかさ - BEAT PROD: @suhmeduh #Damndaniel', -            'description': 'あかさ on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', +            'title': 
'JG - BEAT PROD: @suhmeduh #Damndaniel', +            'description': 'JG on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',              'thumbnail': r're:^https?://.*\.jpg', -            'uploader': 'あかさ', +            'uploader': 'JG',              'uploader_id': 'jaydingeer',              'duration': 30.0,          }, -        'params': { -            'skip_download': True,  # requires ffmpeg -        },      }, {          'url': 'https://twitter.com/Filmdrunk/status/713801302971588609',          'md5': '89a15ed345d13b86e9a5a5e051fa308a', @@ -342,9 +335,6 @@ class TwitterIE(InfoExtractor):              'uploader': 'Captain America',              'duration': 3.17,          }, -        'params': { -            'skip_download': True,  # requires ffmpeg -        },      }, {          'url': 'https://twitter.com/OPP_HSD/status/779210622571536384',          'info_dict': { @@ -370,9 +360,6 @@ class TwitterIE(InfoExtractor):              'uploader_id': 'news_al3alm',              'duration': 277.4,          }, -        'params': { -            'format': 'best[format_id^=http-]', -        },      }, {          'url': 'https://twitter.com/i/web/status/910031516746514432',          'info_dict': { diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index c248ea727..195f5ce78 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -62,11 +62,11 @@ class UdemyIE(InfoExtractor):      def _extract_course_info(self, webpage, video_id):          course = self._parse_json(              unescapeHTML(self._search_regex( -                r'ng-init=["\'].*\bcourse=({.+?});', webpage, 'course', default='{}')), +                r'ng-init=["\'].*\bcourse=({.+?})[;"\']', +                webpage, 'course', default='{}')),              video_id, fatal=False) or {}          course_id = course.get('id') or self._search_regex( -            (r'"id"\s*:\s*(\d+)', r'data-course-id=["\'](\d+)'), -      
      webpage, 'course id') +            r'data-course-id=["\'](\d+)', webpage, 'course id')          return course_id, course.get('title')      def _enroll_course(self, base_url, webpage, course_id): @@ -257,6 +257,11 @@ class UdemyIE(InfoExtractor):                  video_url = source.get('file') or source.get('src')                  if not video_url or not isinstance(video_url, compat_str):                      continue +                if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        video_url, video_id, 'mp4', entry_protocol='m3u8_native', +                        m3u8_id='hls', fatal=False)) +                    continue                  format_id = source.get('label')                  f = {                      'url': video_url, diff --git a/youtube_dl/extractor/umg.py b/youtube_dl/extractor/umg.py new file mode 100644 index 000000000..d815cd9a6 --- /dev/null +++ b/youtube_dl/extractor/umg.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    parse_filesize, +    parse_iso8601, +) + + +class UMGDeIE(InfoExtractor): +    IE_NAME = 'umg:de' +    IE_DESC = 'Universal Music Deutschland' +    _VALID_URL = r'https?://(?:www\.)?universal-music\.de/[^/]+/videos/[^/?#]+-(?P<id>\d+)' +    _TEST = { +        'url': 'https://www.universal-music.de/sido/videos/jedes-wort-ist-gold-wert-457803', +        'md5': 'ebd90f48c80dcc82f77251eb1902634f', +        'info_dict': { +            'id': '457803', +            'ext': 'mp4', +            'title': 'Jedes Wort ist Gold wert', +            'timestamp': 1513591800, +            'upload_date': '20171218', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        video_data = self._download_json( +            'https://api.universal-music.de/graphql', + 
            video_id, query={ +                'query': '''{ +  universalMusic(channel:16) { +    video(id:%s) { +      headline +      formats { +        formatId +        url +        type +        width +        height +        mimeType +        fileSize +      } +      duration +      createdDate +    } +  } +}''' % video_id})['data']['universalMusic']['video'] + +        title = video_data['headline'] +        hls_url_template = 'http://mediadelivery.universal-music-services.de/vod/mp4:autofill/storage/' + '/'.join(list(video_id)) + '/content/%s/file/playlist.m3u8' + +        thumbnails = [] +        formats = [] + +        def add_m3u8_format(format_id): +            m3u8_formats = self._extract_m3u8_formats( +                hls_url_template % format_id, video_id, 'mp4', +                'm3u8_native', m3u8_id='hls', fatal=False) +            if m3u8_formats and m3u8_formats[0].get('height'): +                formats.extend(m3u8_formats) + +        for f in video_data.get('formats', []): +            f_url = f.get('url') +            mime_type = f.get('mimeType') +            if not f_url or mime_type == 'application/mxf': +                continue +            fmt = { +                'url': f_url, +                'width': int_or_none(f.get('width')), +                'height': int_or_none(f.get('height')), +                'filesize': parse_filesize(f.get('fileSize')), +            } +            f_type = f.get('type') +            if f_type == 'Image': +                thumbnails.append(fmt) +            elif f_type == 'Video': +                format_id = f.get('formatId') +                if format_id: +                    fmt['format_id'] = format_id +                    if mime_type == 'video/mp4': +                        add_m3u8_format(format_id) +                urlh = self._request_webpage(f_url, video_id, fatal=False) +                if urlh: +                    first_byte = urlh.read(1) +                    if first_byte not in (b'F', 
b'\x00'): +                        continue +                    formats.append(fmt) +        if not formats: +            for format_id in (867, 836, 940): +                add_m3u8_format(format_id) +        self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr')) + +        return { +            'id': video_id, +            'title': title, +            'duration': int_or_none(video_data.get('duration')), +            'timestamp': parse_iso8601(video_data.get('createdDate'), ' '), +            'thumbnails': thumbnails, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 0d8376522..d4838b3e5 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -414,7 +414,7 @@ class VKIE(VKBaseIE):          view_count = str_to_int(self._search_regex(              r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)', -            info_page, 'view count', fatal=False)) +            info_page, 'view count', default=None))          formats = []          for format_id, format_url in data.items(): diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py index 5de3deb8c..751b21ee5 100644 --- a/youtube_dl/extractor/voot.py +++ b/youtube_dl/extractor/voot.py @@ -2,7 +2,6 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from .kaltura import KalturaIE  from ..utils import (      ExtractorError,      int_or_none, @@ -21,7 +20,6 @@ class VootIE(InfoExtractor):              'ext': 'mp4',              'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340',              'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', -            'uploader_id': 'batchUser',              'timestamp': 1472162937,              'upload_date': '20160825',              'duration': 1146, @@ -63,6 +61,10 @@ class VootIE(InfoExtractor):          entry_id = media['EntryId']          title = media['MediaName'] +        formats = self._extract_m3u8_formats( +            
'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, +            video_id, 'mp4', m3u8_id='hls') +        self._sort_formats(formats)          description, series, season_number, episode, episode_number = [None] * 5 @@ -82,9 +84,8 @@ class VootIE(InfoExtractor):                  episode_number = int_or_none(value)          return { -            '_type': 'url_transparent', -            'url': 'kaltura:1982551:%s' % entry_id, -            'ie_key': KalturaIE.ie_key(), +            'extractor_key': 'Kaltura', +            'id': entry_id,              'title': title,              'description': description,              'series': series, @@ -95,4 +96,5 @@ class VootIE(InfoExtractor):              'duration': int_or_none(media.get('Duration')),              'view_count': int_or_none(media.get('ViewCounter')),              'like_count': int_or_none(media.get('like_counter')), +            'formats': formats,          } diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 52f8ded2f..68652a22f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -76,6 +76,10 @@ class XHamsterIE(InfoExtractor):              'skip_download': True,          },      }, { +        # mobile site +        'url': 'https://m.xhamster.com/videos/cute-teen-jacqueline-solo-masturbation-8559111', +        'only_matching': True, +    }, {          'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',          'only_matching': True,      }, { @@ -93,7 +97,8 @@ class XHamsterIE(InfoExtractor):          video_id = mobj.group('id') or mobj.group('id_2')          display_id = mobj.group('display_id') or mobj.group('display_id_2') -        webpage = self._download_webpage(url, video_id) +        desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url) +        webpage = self._download_webpage(desktop_url, video_id)          error = self._html_search_regex(             
 r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', @@ -229,8 +234,8 @@ class XHamsterIE(InfoExtractor):              webpage, 'uploader', default='anonymous')          thumbnail = self._search_regex( -            [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', -             r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''], +            [r'''["']thumbUrl["']\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', +             r'''<video[^>]+"poster"=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''],              webpage, 'thumbnail', fatal=False, group='thumbnail')          duration = parse_duration(self._search_regex( @@ -274,15 +279,16 @@ class XHamsterIE(InfoExtractor):  class XHamsterEmbedIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)' +    _VALID_URL = r'https?://(?:.+?\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'      _TEST = {          'url': 'http://xhamster.com/xembed.php?video=3328539',          'info_dict': {              'id': '3328539',              'ext': 'mp4',              'title': 'Pen Masturbation', +            'timestamp': 1406581861,              'upload_date': '20140728', -            'uploader_id': 'anonymous', +            'uploader': 'ManyakisArt',              'duration': 5,              'age_limit': 18,          } diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index f0ba01197..c7947d4a1 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor):          # request basic data          basic_data_params = {              'vid': video_id, -            'ccode': '0501', +            'ccode': '0507',              'client_ip': '192.168.1.1',              'utid': cna,              'client_ts': time.time() / 1000, @@ -241,6 +241,10 @@ class YoukuShowIE(InfoExtractor):          # Ongoing playlist. 
The initial page is the last one          'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html',          'only_matching': True, +    }, { +        #  No data-id value. +        'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html', +        'only_matching': True,      }]      def _extract_entries(self, playlist_data_url, show_id, note, query): @@ -276,9 +280,9 @@ class YoukuShowIE(InfoExtractor):              r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id')          # The first reload_id has the same items as first_page          reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page) +        entries.extend(initial_entries)          for idx, reload_id in enumerate(reload_ids):              if reload_id == first_page_reload_id: -                entries.extend(initial_entries)                  continue              _, new_entries = self._extract_entries(                  'http://list.youku.com/show/episode', show_id, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9943dddc1..0919bef0e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2270,6 +2270,19 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):              r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',              page, 'title', default=None) +        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' +        uploader = self._search_regex( +            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, +            page, 'uploader', default=None) +        mobj = re.search( +            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, +            page) +        if mobj: +            uploader_id = mobj.group('uploader_id') +            uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) +        else: +            uploader_id = uploader_url = None +          has_videos = True      
    if not playlist_title: @@ -2280,8 +2293,15 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):              except StopIteration:                  has_videos = False -        return has_videos, self.playlist_result( +        playlist = self.playlist_result(              self._entries(page, playlist_id), playlist_id, playlist_title) +        playlist.update({ +            'uploader': uploader, +            'uploader_id': uploader_id, +            'uploader_url': uploader_url, +        }) + +        return has_videos, playlist      def _check_download_just_video(self, url, playlist_id):          # Check if it's a video-specific URL diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index fbdfa02ac..b0aed9ca7 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -42,6 +42,7 @@ class XAttrMetadataPP(PostProcessor):                  'user.dublincore.format': 'format',              } +            num_written = 0              for xattrname, infoname in xattr_mapping.items():                  value = info.get(infoname) @@ -52,6 +53,7 @@ class XAttrMetadataPP(PostProcessor):                      byte_value = value.encode('utf-8')                      write_xattr(filename, xattrname, byte_value) +                    num_written += 1              return [], info @@ -62,8 +64,8 @@ class XAttrMetadataPP(PostProcessor):          except XAttrMetadataError as e:              if e.reason == 'NO_SPACE':                  self._downloader.report_warning( -                    'There\'s no disk space left or disk quota exceeded. ' + -                    'Extended attributes are not written.') +                    'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. 
' + +                    (('Some ' if num_written else '') + 'extended attributes are not written.').capitalize())              elif e.reason == 'VALUE_TOO_LONG':                  self._downloader.report_warning(                      'Unable to write extended attributes due to too long values.') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index eccbc0b1f..2843a3dc0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -159,6 +159,8 @@ DATE_FORMATS = (      '%Y-%m-%dT%H:%M',      '%b %d %Y at %H:%M',      '%b %d %Y at %H:%M:%S', +    '%B %d %Y at %H:%M', +    '%B %d %Y at %H:%M:%S',  )  DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 88bf1d652..f999584d7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2017.12.02' +__version__ = '2017.12.23'  | 
