diff options
Diffstat (limited to 'youtube_dl')
96 files changed, 3937 insertions, 1672 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5036289b0..ba72ec6f3 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -196,8 +196,8 @@ class YoutubeDL(object):      prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.                         At the moment, this is only supported by YouTube.      proxy:             URL of the proxy server to use -    cn_verification_proxy:  URL of the proxy to use for IP address verification -                       on Chinese sites. (Experimental) +    geo_verification_proxy:  URL of the proxy to use for IP address verification +                       on geo-restricted sites. (Experimental)      socket_timeout:    Time to wait for unresponsive hosts, in seconds      bidi_workaround:   Work around buggy terminals without bidirectional text                         support, using fridibi @@ -304,6 +304,11 @@ class YoutubeDL(object):          self.params.update(params)          self.cache = Cache(self) +        if self.params.get('cn_verification_proxy') is not None: +            self.report_warning('--cn-verification-proxy is deprecated. 
Use --geo-verification-proxy instead.') +            if self.params.get('geo_verification_proxy') is None: +                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] +          if params.get('bidi_workaround', False):              try:                  import pty diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 4905674ad..2b34bf9c2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -382,6 +382,8 @@ def _real_main(argv=None):          'external_downloader_args': external_downloader_args,          'postprocessor_args': postprocessor_args,          'cn_verification_proxy': opts.cn_verification_proxy, +        'geo_verification_proxy': opts.geo_verification_proxy, +      }      with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 67db1c7c6..b8aaf5a46 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,3 +1,4 @@ +# coding: utf-8  from __future__ import unicode_literals  import binascii @@ -2594,15 +2595,19 @@ except ImportError:  # Python < 3.3              return "'" + s.replace("'", "'\"'\"'") + "'" -if sys.version_info >= (2, 7, 3): +try: +    args = shlex.split('中文') +    assert (isinstance(args, list) and +            isinstance(args[0], compat_str) and +            args[0] == '中文')      compat_shlex_split = shlex.split -else: +except (AssertionError, UnicodeEncodeError):      # Working around shlex issue with unicode strings on some python 2      # versions (see http://bugs.python.org/issue1548891)      def compat_shlex_split(s, comments=False, posix=True):          if isinstance(s, compat_str):              s = s.encode('utf-8') -        return shlex.split(s, comments, posix) +        return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix)))  def compat_ord(c): diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 8f88b0241..80c21d40b 100644 --- a/youtube_dl/downloader/f4m.py +++ 
b/youtube_dl/downloader/f4m.py @@ -196,6 +196,11 @@ def build_fragments_list(boot_info):      first_frag_number = fragment_run_entry_table[0]['first']      fragments_counter = itertools.count(first_frag_number)      for segment, fragments_count in segment_run_table['segment_run']: +        # In some live HDS streams (for example Rai), `fragments_count` is +        # abnormal and causing out-of-memory errors. It's OK to change the +        # number of fragments for live streams as they are updated periodically +        if fragments_count == 4294967295 and boot_info['live']: +            fragments_count = 2          for _ in range(fragments_count):              res.append((segment, next(fragments_counter))) @@ -329,7 +334,11 @@ class F4mFD(FragmentFD):          base_url = compat_urlparse.urljoin(man_url, media.attrib['url'])          bootstrap_node = doc.find(_add_ns('bootstrapInfo')) -        boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, base_url) +        # From Adobe F4M 3.0 spec: +        # The <baseURL> element SHALL be the base URL for all relative +        # (HTTP-based) URLs in the manifest. If <baseURL> is not present, said +        # URLs should be relative to the location of the containing document. 
+        boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url)          live = boot_info['live']          metadata_node = media.find(_add_ns('metadata'))          if metadata_node is not None: diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 1bbfe2641..8f53050c9 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -2,41 +2,33 @@ from __future__ import unicode_literals  import re -from .common import InfoExtractor +from .theplatform import ThePlatformIE  from ..utils import (      smuggle_url,      update_url_query,      unescapeHTML, +    extract_attributes, +    get_element_by_attribute,  ) +from ..compat import ( +    compat_urlparse, +) + +class AENetworksBaseIE(ThePlatformIE): +    _THEPLATFORM_KEY = 'crazyjava' +    _THEPLATFORM_SECRET = 's3cr3t' -class AENetworksIE(InfoExtractor): + +class AENetworksIE(AENetworksBaseIE):      IE_NAME = 'aenetworks'      IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' -    _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P<type>[^/]+)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])' - +    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)/full-movie)'      _TESTS = [{ -        'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', -        'info_dict': { -            'id': 'g12m5Gyt3fdR', -            'ext': 'mp4', -            'title': "Bet You Didn't Know: Valentine's Day", -            'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', -            'timestamp': 1375819729, -            'upload_date': '20130806', -            'uploader': 'AENE-NEW', -        }, -        'params': { -            # m3u8 download -            'skip_download': True, -        }, 
-        'add_ie': ['ThePlatform'], -        'expected_warnings': ['JSON-LD'], -    }, {          'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',          'md5': '8ff93eb073449f151d6b90c0ae1ef0c7',          'info_dict': { -            'id': 'eg47EERs_JsZ', +            'id': '22253814',              'ext': 'mp4',              'title': 'Winter Is Coming',              'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', @@ -46,42 +38,168 @@ class AENetworksIE(InfoExtractor):          },          'add_ie': ['ThePlatform'],      }, { -        'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry', +        'url': 'http://www.history.com/shows/ancient-aliens/season-1', +        'info_dict': { +            'id': '71889446852', +        }, +        'playlist_mincount': 5, +    }, { +        'url': 'http://www.mylifetime.com/shows/atlanta-plastic', +        'info_dict': { +            'id': 'SERIES4317', +            'title': 'Atlanta Plastic', +        }, +        'playlist_mincount': 2, +    }, { +        'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',          'only_matching': True      }, { -        'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage', +        'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',          'only_matching': True      }, { -        'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients', +        'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6', +        'only_matching': True +    }, { +        'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie',          'only_matching': True      }] +    _DOMAIN_TO_REQUESTOR_ID = { +        'history.com': 'HISTORY', +        'aetv.com': 'AETV', +        'mylifetime.com': 'LIFETIME', +        'fyi.tv': 'FYI', +    }      def _real_extract(self, url): -        page_type, video_id = 
re.match(self._VALID_URL, url).groups() +        domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups() +        display_id = show_path or movie_display_id +        webpage = self._download_webpage(url, display_id) +        if show_path: +            url_parts = show_path.split('/') +            url_parts_len = len(url_parts) +            if url_parts_len == 1: +                entries = [] +                for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): +                    entries.append(self.url_result( +                        compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) +                return self.playlist_result( +                    entries, self._html_search_meta('aetn:SeriesId', webpage), +                    self._html_search_meta('aetn:SeriesTitle', webpage)) +            elif url_parts_len == 2: +                entries = [] +                for episode_item in re.findall(r'(?s)<div[^>]+class="[^"]*episode-item[^"]*"[^>]*>', webpage): +                    episode_attributes = extract_attributes(episode_item) +                    episode_url = compat_urlparse.urljoin( +                        url, episode_attributes['data-canonical']) +                    entries.append(self.url_result( +                        episode_url, 'AENetworks', +                        episode_attributes['data-videoid'])) +                return self.playlist_result( +                    entries, self._html_search_meta('aetn:SeasonId', webpage)) + +        query = { +            'mbr': 'true', +            'assetTypes': 'medium_video_s3' +        } +        video_id = self._html_search_meta('aetn:VideoID', webpage) +        media_url = self._search_regex( +            r"media_url\s*=\s*'([^']+)'", webpage, 'video url') +        theplatform_metadata = self._download_theplatform_metadata(self._search_regex( +            r'https?://link.theplatform.com/s/([^?]+)', media_url, 
'theplatform_path'), video_id) +        info = self._parse_theplatform_metadata(theplatform_metadata) +        if theplatform_metadata.get('AETN$isBehindWall'): +            requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] +            resource = '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/"><channel><title>%s</title><item><title>%s</title><guid>%s</guid><media:rating scheme="urn:v-chip">%s</media:rating></item></channel></rss>' % (requestor_id, theplatform_metadata['title'], theplatform_metadata['AETN$PPL_pplProgramId'], theplatform_metadata['ratings'][0]['rating']) +            query['auth'] = self._extract_mvpd_auth( +                url, video_id, requestor_id, resource) +        info.update(self._search_json_ld(webpage, video_id, fatal=False)) +        media_url = update_url_query(media_url, query) +        media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) +        formats, subtitles = self._extract_theplatform_smil(media_url, video_id) +        self._sort_formats(formats) +        info.update({ +            'id': video_id, +            'formats': formats, +            'subtitles': subtitles, +        }) +        return info -        webpage = self._download_webpage(url, video_id) -        video_url_re = [ -            r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, -            r"media_url\s*=\s*'([^']+)'" -        ] -        video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url')) -        query = {'mbr': 'true'} -        if page_type == 'shows': -            query['assetTypes'] = 'medium_video_s3' -        if 'switch=hds' in video_url: -            query['switch'] = 'hls' +class HistoryTopicIE(AENetworksBaseIE): +    IE_NAME = 'history:topic' +    IE_DESC = 'History.com Topic' +    _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?' 
+    _TESTS = [{ +        'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', +        'info_dict': { +            'id': '40700995724', +            'ext': 'mp4', +            'title': "Bet You Didn't Know: Valentine's Day", +            'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', +            'timestamp': 1375819729, +            'upload_date': '20130806', +            'uploader': 'AENE-NEW', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +        'add_ie': ['ThePlatform'], +    }, { +        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos', +        'info_dict': +        { +            'id': 'world-war-i-history', +            'title': 'World War I History', +        }, +        'playlist_mincount': 24, +    }, { +        'url': 'http://www.history.com/topics/world-war-i-history/videos', +        'only_matching': True, +    }, { +        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history', +        'only_matching': True, +    }, { +        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches', +        'only_matching': True, +    }] -        info = self._search_json_ld(webpage, video_id, fatal=False) -        info.update({ +    def theplatform_url_result(self, theplatform_url, video_id, query): +        return {              '_type': 'url_transparent', +            'id': video_id,              'url': smuggle_url( -                update_url_query(video_url, query), +                update_url_query(theplatform_url, query),                  {                      'sig': { -                        'key': 'crazyjava', -                        'secret': 's3cr3t'}, +                        'key': self._THEPLATFORM_KEY, +                        'secret': self._THEPLATFORM_SECRET, +                    },           
           'force_smil_url': True                  }), -        }) -        return info +            'ie_key': 'ThePlatform', +        } + +    def _real_extract(self, url): +        topic_id, video_display_id = re.match(self._VALID_URL, url).groups() +        if video_display_id: +            webpage = self._download_webpage(url, video_display_id) +            release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups() +            release_url = unescapeHTML(release_url) + +            return self.theplatform_url_result( +                release_url, video_id, { +                    'mbr': 'true', +                    'switch': 'hls' +                }) +        else: +            webpage = self._download_webpage(url, topic_id) +            entries = [] +            for episode_item in re.findall(r'<a.+?data-release-url="[^"]+"[^>]*>', webpage): +                video_attributes = extract_attributes(episode_item) +                entries.append(self.theplatform_url_result( +                    video_attributes['data-release-url'], video_attributes['data-id'], { +                        'mbr': 'true', +                        'switch': 'hls' +                    })) +            return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage)) diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 8545681be..e8e40126b 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -5,6 +5,8 @@ from .common import InfoExtractor  from ..utils import (      int_or_none,      parse_iso8601, +    mimetype2ext, +    determine_ext,  ) @@ -50,21 +52,25 @@ class AMPIE(InfoExtractor):          if isinstance(media_content, dict):              media_content = [media_content]          for media_data in media_content: -            media = media_data['@attributes'] -            media_type = media['type'] -            if media_type in ('video/f4m', 
'application/f4m+xml'): +            media = media_data.get('@attributes', {}) +            media_url = media.get('url') +            if not media_url: +                continue +            ext = mimetype2ext(media.get('type')) or determine_ext(media_url) +            if ext == 'f4m':                  formats.extend(self._extract_f4m_formats( -                    media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', +                    media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',                      video_id, f4m_id='hds', fatal=False)) -            elif media_type == 'application/x-mpegURL': +            elif ext == 'm3u8':                  formats.extend(self._extract_m3u8_formats( -                    media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False)) +                    media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))              else:                  formats.append({                      'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),                      'url': media['url'],                      'tbr': int_or_none(media.get('bitrate')),                      'filesize': int_or_none(media.get('fileSize')), +                    'ext': ext,                  })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9b01e38f5..9e28f2579 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -22,6 +22,7 @@ class AnimeOnDemandIE(InfoExtractor):      _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'      _NETRC_MACHINE = 'animeondemand'      _TESTS = [{ +        # jap, OmU          'url': 'https://www.anime-on-demand.de/anime/161',          'info_dict': {              'id': '161', @@ -30,17 +31,21 @@ class AnimeOnDemandIE(InfoExtractor):          },          'playlist_mincount': 4,      }, { -        # Film wording is used instead of Episode +        # Film 
wording is used instead of Episode, ger/jap, Dub/OmU          'url': 'https://www.anime-on-demand.de/anime/39',          'only_matching': True,      }, { -        # Episodes without titles +        # Episodes without titles, jap, OmU          'url': 'https://www.anime-on-demand.de/anime/162',          'only_matching': True,      }, {          # ger/jap, Dub/OmU, account required          'url': 'https://www.anime-on-demand.de/anime/169',          'only_matching': True, +    }, { +        # Full length film, non-series, ger/jap, Dub/OmU, account required +        'url': 'https://www.anime-on-demand.de/anime/185', +        'only_matching': True,      }]      def _login(self): @@ -110,35 +115,12 @@ class AnimeOnDemandIE(InfoExtractor):          entries = [] -        for num, episode_html in enumerate(re.findall( -                r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1): -            episodebox_title = self._search_regex( -                (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', -                 r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), -                episode_html, 'episodebox title', default=None, group='title') -            if not episodebox_title: -                continue - -            episode_number = int(self._search_regex( -                r'(?:Episode|Film)\s*(\d+)', -                episodebox_title, 'episode number', default=num)) -            episode_title = self._search_regex( -                r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', -                episodebox_title, 'episode title', default=None) - -            video_id = 'episode-%d' % episode_number - -            common_info = { -                'id': video_id, -                'series': anime_title, -                'episode': episode_title, -                'episode_number': episode_number, -            } - +        def extract_info(html, video_id, num=None): +            title, description = [None] * 2              formats = []         
     for input_ in re.findall( -                    r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html): +                    r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html):                  attributes = extract_attributes(input_)                  playlist_urls = []                  for playlist_key in ('data-playlist', 'data-otherplaylist'): @@ -161,7 +143,7 @@ class AnimeOnDemandIE(InfoExtractor):                          format_id_list.append(lang)                      if kind:                          format_id_list.append(kind) -                    if not format_id_list: +                    if not format_id_list and num is not None:                          format_id_list.append(compat_str(num))                      format_id = '-'.join(format_id_list)                      format_note = ', '.join(filter(None, (kind, lang_note))) @@ -215,28 +197,74 @@ class AnimeOnDemandIE(InfoExtractor):                              })                          formats.extend(file_formats) -            if formats: -                self._sort_formats(formats) +            return { +                'title': title, +                'description': description, +                'formats': formats, +            } + +        def extract_entries(html, video_id, common_info, num=None): +            info = extract_info(html, video_id, num) + +            if info['formats']: +                self._sort_formats(info['formats'])                  f = common_info.copy() -                f.update({ -                    'title': title, -                    'description': description, -                    'formats': formats, -                }) +                f.update(info)                  entries.append(f) -            # Extract teaser only when full episode is not available -            if not formats: +            # Extract teaser/trailer only when full episode is not available +            if not info['formats']:                  m = re.search( -            
        r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', -                    episode_html) +                    r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<', +                    html)                  if m:                      f = common_info.copy()                      f.update({ -                        'id': '%s-teaser' % f['id'], +                        'id': '%s-%s' % (f['id'], m.group('kind').lower()),                          'title': m.group('title'),                          'url': compat_urlparse.urljoin(url, m.group('href')),                      })                      entries.append(f) +        def extract_episodes(html): +            for num, episode_html in enumerate(re.findall( +                    r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1): +                episodebox_title = self._search_regex( +                    (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', +                     r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), +                    episode_html, 'episodebox title', default=None, group='title') +                if not episodebox_title: +                    continue + +                episode_number = int(self._search_regex( +                    r'(?:Episode|Film)\s*(\d+)', +                    episodebox_title, 'episode number', default=num)) +                episode_title = self._search_regex( +                    r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', +                    episodebox_title, 'episode title', default=None) + +                video_id = 'episode-%d' % episode_number + +                common_info = { +                    'id': video_id, +                    'series': anime_title, +                    'episode': episode_title, +                    'episode_number': episode_number, +                } + +                extract_entries(episode_html, video_id, 
common_info) + +        def extract_film(html, video_id): +            common_info = { +                'id': anime_id, +                'title': anime_title, +                'description': anime_description, +            } +            extract_entries(html, video_id, common_info) + +        extract_episodes(webpage) + +        if not entries: +            extract_film(webpage, anime_id) +          return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index be40f85b4..a6801f3d4 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -7,6 +7,8 @@ from .common import InfoExtractor  from ..compat import compat_urlparse  from ..utils import (      int_or_none, +    parse_duration, +    unified_strdate,  ) @@ -16,7 +18,8 @@ class AppleTrailersIE(InfoExtractor):      _TESTS = [{          'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',          'info_dict': { -            'id': 'manofsteel', +            'id': '5111', +            'title': 'Man of Steel',          },          'playlist': [              { @@ -70,6 +73,15 @@ class AppleTrailersIE(InfoExtractor):              'id': 'blackthorn',          },          'playlist_mincount': 2, +        'expected_warnings': ['Unable to download JSON metadata'], +    }, { +        # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json +        'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/', +        'info_dict': { +            'id': '15881', +            'title': 'Kung Fu Panda 3', +        }, +        'playlist_mincount': 4,      }, {          'url': 'http://trailers.apple.com/ca/metropole/autrui/',          'only_matching': True, @@ -85,6 +97,45 @@ class AppleTrailersIE(InfoExtractor):          movie = mobj.group('movie')          uploader_id = mobj.group('company') +        webpage = self._download_webpage(url, 
movie) +        film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id') +        film_data = self._download_json( +            'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id, +            film_id, fatal=False) + +        if film_data: +            entries = [] +            for clip in film_data.get('clips', []): +                clip_title = clip['title'] + +                formats = [] +                for version, version_data in clip.get('versions', {}).items(): +                    for size, size_data in version_data.get('sizes', {}).items(): +                        src = size_data.get('src') +                        if not src: +                            continue +                        formats.append({ +                            'format_id': '%s-%s' % (version, size), +                            'url': re.sub(r'_(\d+p.mov)', r'_h\1', src), +                            'width': int_or_none(size_data.get('width')), +                            'height': int_or_none(size_data.get('height')), +                            'language': version[:2], +                        }) +                self._sort_formats(formats) + +                entries.append({ +                    'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), +                    'formats': formats, +                    'title': clip_title, +                    'thumbnail': clip.get('screen') or clip.get('thumb'), +                    'duration': parse_duration(clip.get('runtime') or clip.get('faded')), +                    'upload_date': unified_strdate(clip.get('posted')), +                    'uploader_id': uploader_id, +                }) + +            page_data = film_data.get('page', {}) +            return self.playlist_result(entries, film_id, page_data.get('movie_title')) +          playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')          def fix_html(s): diff --git a/youtube_dl/extractor/ard.py 
b/youtube_dl/extractor/ard.py index fd45b3e42..13a06396d 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -13,6 +13,7 @@ from ..utils import (      parse_duration,      unified_strdate,      xpath_text, +    update_url_query,  )  from ..compat import compat_etree_fromstring @@ -34,6 +35,7 @@ class ARDMediathekIE(InfoExtractor):              # m3u8 download              'skip_download': True,          }, +        'skip': 'HTTP Error 404: Not Found',      }, {          'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916',          'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', @@ -44,6 +46,7 @@ class ARDMediathekIE(InfoExtractor):              'description': 'md5:196392e79876d0ac94c94e8cdb2875f1',              'duration': 5252,          }, +        'skip': 'HTTP Error 404: Not Found',      }, {          # audio          'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', @@ -55,6 +58,7 @@ class ARDMediathekIE(InfoExtractor):              'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',              'duration': 3240,          }, +        'skip': 'HTTP Error 404: Not Found',      }, {          'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',          'only_matching': True, @@ -113,11 +117,14 @@ class ARDMediathekIE(InfoExtractor):                          continue                      if ext == 'f4m':                          formats.extend(self._extract_f4m_formats( -                            stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', -                            video_id, preference=-1, f4m_id='hds', fatal=False)) +                            update_url_query(stream_url, { +                                'hdcore': '3.1.1', +                                'plugin': 
'aasp-3.1.1.69.124' +                            }), +                            video_id, f4m_id='hds', fatal=False))                      elif ext == 'm3u8':                          formats.extend(self._extract_m3u8_formats( -                            stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False)) +                            stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))                      else:                          if server and server.startswith('rtmp'):                              f = { @@ -231,7 +238,8 @@ class ARDIE(InfoExtractor):              'title': 'Die Story im Ersten: Mission unter falscher Flagge',              'upload_date': '20140804',              'thumbnail': 're:^https?://.*\.jpg$', -        } +        }, +        'skip': 'HTTP Error 404: Not Found',      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 049f1fa9e..e0c5c1804 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -419,6 +419,7 @@ class ArteTVPlaylistIE(ArteTVBaseIE):          'info_dict': {              'id': 'PL-013263',              'title': 'Areva & Uramin', +            'description': 'md5:a1dc0312ce357c262259139cfd48c9bf',          },          'playlist_mincount': 6,      }, { diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ef560b592..57ce0c174 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -90,6 +90,7 @@ class BrightcoveLegacyIE(InfoExtractor):                  'description': 'md5:363109c02998fee92ec02211bd8000df',                  'uploader': 'National Ballet of Canada',              }, +            'skip': 'Video gone',          },          {              # test flv videos served by akamaihd.net @@ -108,7 +109,7 @@ class BrightcoveLegacyIE(InfoExtractor):              },          },          { -            # playlist test +            # playlist 
with 'videoList'              # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players              'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',              'info_dict': { @@ -117,6 +118,15 @@ class BrightcoveLegacyIE(InfoExtractor):              },              'playlist_mincount': 7,          }, +        { +            # playlist with 'playlistTab' (https://github.com/rg3/youtube-dl/issues/9965) +            'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg', +            'info_dict': { +                'id': '1522758701001', +                'title': 'Lesson 08', +            }, +            'playlist_mincount': 10, +        },      ]      FLV_VCODECS = {          1: 'SORENSON', @@ -298,13 +308,19 @@ class BrightcoveLegacyIE(InfoExtractor):              info_url, player_key, 'Downloading playlist information')          json_data = json.loads(playlist_info) -        if 'videoList' not in json_data: +        if 'videoList' in json_data: +            playlist_info = json_data['videoList'] +            playlist_dto = playlist_info['mediaCollectionDTO'] +        elif 'playlistTabs' in json_data: +            playlist_info = json_data['playlistTabs'] +            playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0] +        else:              raise ExtractorError('Empty playlist') -        playlist_info = json_data['videoList'] -        videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] + +        videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']]          return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'], -                                    
playlist_title=playlist_info['mediaCollectionDTO']['displayName']) +                                    playlist_title=playlist_dto['displayName'])      def _extract_video_info(self, video_info):          video_id = compat_str(video_info['id']) @@ -585,6 +601,13 @@ class BrightcoveNewIE(InfoExtractor):                          'format_id': build_format_id('rtmp'),                      })                  formats.append(f) + +        errors = json_data.get('errors') +        if not formats and errors: +            error = errors[0] +            raise ExtractorError( +                error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) +          self._sort_formats(formats)          subtitles = {} diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py index df503ecc0..75fa92d7c 100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -5,6 +5,7 @@ import json  import re  from .common import InfoExtractor +from .facebook import FacebookIE  class BuzzFeedIE(InfoExtractor): @@ -20,11 +21,11 @@ class BuzzFeedIE(InfoExtractor):              'info_dict': {                  'id': 'aVCR29aE_OQ',                  'ext': 'mp4', +                'title': 'Angry Ram destroys a punching bag..', +                'description': 'md5:c59533190ef23fd4458a5e8c8c872345',                  'upload_date': '20141024',                  'uploader_id': 'Buddhanz1', -                'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl', -                'uploader': 'Buddhanz', -                'title': 'Angry Ram destroys a punching bag', +                'uploader': 'Angry Ram',              }          }]      }, { @@ -41,13 +42,30 @@ class BuzzFeedIE(InfoExtractor):              'info_dict': {                  'id': 'mVmBL8B-In0',        
          'ext': 'mp4', +                'title': 're:Munchkin the Teddy Bear gets her exercise', +                'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',                  'upload_date': '20141124',                  'uploader_id': 'CindysMunchkin', -                'description': 're:© 2014 Munchkin the',                  'uploader': 're:^Munchkin the', -                'title': 're:Munchkin the Teddy Bear gets her exercise',              },          }] +    }, { +        'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK', +        'info_dict': { +            'id': 'the-most-adorable-crash-landing-ever', +            'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing', +            'description': 'This gosling knows how to stick a landing.', +        }, +        'playlist': [{ +            'md5': '763ca415512f91ca62e4621086900a23', +            'info_dict': { +                'id': '971793786185728', +                'ext': 'mp4', +                'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...', +                'uploader': 'Calgary Outdoor Centre-University of Calgary', +            }, +        }], +        'add_ie': ['Facebook'],      }]      def _real_extract(self, url): @@ -66,6 +84,10 @@ class BuzzFeedIE(InfoExtractor):                  continue              entries.append(self.url_result(video['url'])) +        facebook_url = FacebookIE._extract_url(webpage) +        if facebook_url: +            entries.append(self.url_result(facebook_url)) +          return {              '_type': 'playlist',              'id': playlist_id, diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 0011c3029..821db20b2 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -80,9 +80,6 @@ class CBSInteractiveIE(ThePlatformIE):          media_guid_path = 
'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId'])          formats, subtitles = [], {} -        if site == 'cnet': -            formats, subtitles = self._extract_theplatform_smil( -                self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id)          for (fkey, vid) in vdata['files'].items():              if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:                  continue @@ -94,7 +91,7 @@ class CBSInteractiveIE(ThePlatformIE):              subtitles = self._merge_subtitles(subtitles, tp_subtitles)          self._sort_formats(formats) -        info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id) +        info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id)          info.update({              'id': video_id,              'display_id': display_id, diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py index 4f9320ea5..d55b26d59 100644 --- a/youtube_dl/extractor/cliprs.py +++ b/youtube_dl/extractor/cliprs.py @@ -1,16 +1,10 @@  # coding: utf-8  from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -    float_or_none, -    int_or_none, -    parse_iso8601, -) +from .onet import OnetBaseIE -class ClipRsIE(InfoExtractor): +class ClipRsIE(OnetBaseIE):      _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+'      _TEST = {          'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', @@ -27,64 +21,13 @@ class ClipRsIE(InfoExtractor):      }      def _real_extract(self, url): -        video_id = self._match_id(url) +        display_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) +        webpage = self._download_webpage(url, display_id) -        video_id = self._search_regex( -            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') +        mvp_id = self._search_mvp_id(webpage) -        response = 
self._download_json( -            'http://qi.ckm.onetapi.pl/', video_id, -            query={ -                'body[id]': video_id, -                'body[jsonrpc]': '2.0', -                'body[method]': 'get_asset_detail', -                'body[params][ID_Publikacji]': video_id, -                'body[params][Service]': 'www.onet.pl', -                'content-type': 'application/jsonp', -                'x-onet-app': 'player.front.onetapi.pl', -            }) +        info_dict = self._extract_from_id(mvp_id, webpage) +        info_dict['display_id'] = display_id -        error = response.get('error') -        if error: -            raise ExtractorError( -                '%s said: %s' % (self.IE_NAME, error['message']), expected=True) - -        video = response['result'].get('0') - -        formats = [] -        for _, formats_dict in video['formats'].items(): -            if not isinstance(formats_dict, dict): -                continue -            for format_id, format_list in formats_dict.items(): -                if not isinstance(format_list, list): -                    continue -                for f in format_list: -                    if not f.get('url'): -                        continue -                    formats.append({ -                        'url': f['url'], -                        'format_id': format_id, -                        'height': int_or_none(f.get('vertical_resolution')), -                        'width': int_or_none(f.get('horizontal_resolution')), -                        'abr': float_or_none(f.get('audio_bitrate')), -                        'vbr': float_or_none(f.get('video_bitrate')), -                    }) -        self._sort_formats(formats) - -        meta = video.get('meta', {}) - -        title = self._og_search_title(webpage, default=None) or meta['title'] -        description = self._og_search_description(webpage, default=None) or meta.get('description') -        duration = meta.get('length') or meta.get('lenght') -    
    timestamp = parse_iso8601(meta.get('addDate'), ' ') - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'duration': duration, -            'timestamp': timestamp, -            'formats': formats, -        } +        return info_dict diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 661889593..df546da27 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -44,6 +44,7 @@ from ..utils import (      sanitized_Request,      unescapeHTML,      unified_strdate, +    unified_timestamp,      url_basename,      xpath_element,      xpath_text, @@ -163,6 +164,7 @@ class InfoExtractor(object):                          * "height" (optional, int)                          * "resolution" (optional, string "{width}x{height"},                                          deprecated) +                        * "filesize" (optional, int)      thumbnail:      Full URL to a video thumbnail image.      description:    Full video description.      uploader:       Full name of the video uploader. 
@@ -751,10 +753,12 @@ class InfoExtractor(object):          return self._og_search_property('url', html, **kargs)      def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): +        if not isinstance(name, (list, tuple)): +            name = [name]          if display_name is None: -            display_name = name +            display_name = name[0]          return self._html_search_regex( -            self._meta_regex(name), +            [self._meta_regex(n) for n in name],              html, display_name, fatal=fatal, group='content', **kwargs)      def _dc_search_uploader(self, html): @@ -803,15 +807,17 @@ class InfoExtractor(object):          return self._html_search_meta('twitter:player', html,                                        'twitter card player') -    def _search_json_ld(self, html, video_id, **kwargs): +    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):          json_ld = self._search_regex(              r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',              html, 'JSON-LD', group='json_ld', **kwargs)          if not json_ld:              return {} -        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) +        return self._json_ld( +            json_ld, video_id, fatal=kwargs.get('fatal', True), +            expected_type=expected_type) -    def _json_ld(self, json_ld, video_id, fatal=True): +    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):          if isinstance(json_ld, compat_str):              json_ld = self._parse_json(json_ld, video_id, fatal=fatal)          if not json_ld: @@ -819,6 +825,8 @@ class InfoExtractor(object):          info = {}          if json_ld.get('@context') == 'http://schema.org':              item_type = json_ld.get('@type') +            if expected_type is not None and expected_type != item_type: +                return info              if item_type == 'TVEpisode':         
         info.update({                      'episode': unescapeHTML(json_ld.get('name')), @@ -837,6 +845,19 @@ class InfoExtractor(object):                      'title': unescapeHTML(json_ld.get('headline')),                      'description': unescapeHTML(json_ld.get('articleBody')),                  }) +            elif item_type == 'VideoObject': +                info.update({ +                    'url': json_ld.get('contentUrl'), +                    'title': unescapeHTML(json_ld.get('name')), +                    'description': unescapeHTML(json_ld.get('description')), +                    'thumbnail': json_ld.get('thumbnailUrl'), +                    'duration': parse_duration(json_ld.get('duration')), +                    'timestamp': unified_timestamp(json_ld.get('uploadDate')), +                    'filesize': float_or_none(json_ld.get('contentSize')), +                    'tbr': int_or_none(json_ld.get('bitrate')), +                    'width': int_or_none(json_ld.get('width')), +                    'height': int_or_none(json_ld.get('height')), +                })          return dict((k, v) for k, v in info.items() if v is not None)      @staticmethod @@ -878,7 +899,11 @@ class InfoExtractor(object):                  f['ext'] = determine_ext(f['url'])              if isinstance(field_preference, (list, tuple)): -                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) +                return tuple( +                    f.get(field) +                    if f.get(field) is not None +                    else ('' if field == 'format_id' else -1) +                    for field in field_preference)              preference = f.get('preference')              if preference is None: @@ -1781,6 +1806,13 @@ class InfoExtractor(object):      def _mark_watched(self, *args, **kwargs):          raise NotImplementedError('This method must be implemented by subclasses') +    def geo_verification_headers(self): +        
headers = {} +        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') +        if geo_verification_proxy: +            headers['Ytdl-request-proxy'] = geo_verification_proxy +        return headers +  class SearchInfoExtractor(InfoExtractor):      """ diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py new file mode 100644 index 000000000..5807fbac9 --- /dev/null +++ b/youtube_dl/extractor/ctv.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CTVIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)' +    _TESTS = [{ +        'url': 'http://www.ctv.ca/video/player?vid=706966', +        'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', +        'info_dict': { +            'id': '706966', +            'ext': 'mp4', +            'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', +            'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', +            'upload_date': '20150919', +            'timestamp': 1442624700, +        }, +        'expected_warnings': ['HTTP Error 404'], +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        return { +            '_type': 'url_transparent', +            'id': video_id, +            'url': '9c9media:ctv_web:%s' % video_id, +            'ie_key': 'NineCNineMedia', +        } diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py new file mode 100644 index 000000000..1023b6130 --- /dev/null +++ b/youtube_dl/extractor/ctvnews.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import orderedSet + + +class CTVNewsIE(InfoExtractor): +    _VALID_URL = 
r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)' +    _TESTS = [{ +        'url': 'http://www.ctvnews.ca/video?clipId=901995', +        'md5': '10deb320dc0ccb8d01d34d12fc2ea672', +        'info_dict': { +            'id': '901995', +            'ext': 'mp4', +            'title': 'Extended: \'That person cannot be me\' Johnson says', +            'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', +            'timestamp': 1467286284, +            'upload_date': '20160630', +        } +    }, { +        'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224', +        'info_dict': +        { +            'id': '1.2966224', +        }, +        'playlist_mincount': 19, +    }, { +        'url': 'http://www.ctvnews.ca/video?binId=1.2876780', +        'info_dict': +        { +            'id': '1.2876780', +        }, +        'playlist_mincount': 100, +    }, { +        'url': 'http://www.ctvnews.ca/1.810401', +        'only_matching': True, +    }, { +        'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        page_id = self._match_id(url) + +        def ninecninemedia_url_result(clip_id): +            return { +                '_type': 'url_transparent', +                'id': clip_id, +                'url': '9c9media:ctvnews_web:%s' % clip_id, +                'ie_key': 'NineCNineMedia', +            } + +        if page_id.isdigit(): +            return ninecninemedia_url_result(page_id) +        else: +            webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={ +                'ot': 'example.AjaxPageLayout.ot', +                'maxItemsPerPage': 1000000, +            }) +            entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet( +                re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] +            return 
self.playlist_result(entries, page_id) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 2e6226ea0..1f92823b7 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -16,6 +16,7 @@ from ..utils import (      sanitized_Request,      str_to_int,      unescapeHTML, +    mimetype2ext,  ) @@ -111,6 +112,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):          }      ] +    @staticmethod +    def _extract_urls(webpage): +        # Look for embedded Dailymotion player +        matches = re.findall( +            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) +        return list(map(lambda m: unescapeHTML(m[1]), matches)) +      def _real_extract(self, url):          video_id = self._match_id(url) @@ -153,18 +161,19 @@ class DailymotionIE(DailymotionBaseInfoExtractor):                      type_ = media.get('type')                      if type_ == 'application/vnd.lumberjack.manifest':                          continue -                    ext = determine_ext(media_url) -                    if type_ == 'application/x-mpegURL' or ext == 'm3u8': +                    ext = mimetype2ext(type_) or determine_ext(media_url) +                    if ext == 'm3u8':                          formats.extend(self._extract_m3u8_formats(                              media_url, video_id, 'mp4', preference=-1,                              m3u8_id='hls', fatal=False)) -                    elif type_ == 'application/f4m' or ext == 'f4m': +                    elif ext == 'f4m':                          formats.extend(self._extract_f4m_formats(                              media_url, video_id, preference=-1, f4m_id='hds', fatal=False))                      else:                          f = {                              'url': media_url,                              
'format_id': 'http-%s' % quality, +                            'ext': ext,                          }                          m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)                          if m: diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 86024a745..b5c310ccb 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -66,22 +66,32 @@ class DaumIE(InfoExtractor):              'view_count': int,              'comment_count': int,          }, +    }, { +        # Requires dte_type=WEB (#9972) +        'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU', +        'md5': 'a8917742069a4dd442516b86e7d66529', +        'info_dict': { +            'id': 's3794Uf1NZeZ1qMpGpeqeRU', +            'ext': 'mp4', +            'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611', +            'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회', +            'upload_date': '20160611', +        },      }]      def _real_extract(self, url):          video_id = compat_urllib_parse_unquote(self._match_id(url)) -        query = compat_urllib_parse_urlencode({'vid': video_id})          movie_data = self._download_json( -            'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' 
+ query, -            video_id, 'Downloading video formats info') +            'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json', +            video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'})          # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid          if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id):              return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id)          info = self._download_xml( -            'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, -            'Downloading video info') +            'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id, +            'Downloading video info', query={'vid': video_id})          formats = []          for format_el in movie_data['output_list']['output_list']: diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 5deff5f30..efb8585e8 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -20,7 +20,7 @@ from ..utils import (  class DCNIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' +    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'      
def _real_extract(self, url):          show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() @@ -55,30 +55,32 @@ class DCNBaseIE(InfoExtractor):              'is_live': is_live,          } -    def _extract_video_formats(self, webpage, video_id, entry_protocol): +    def _extract_video_formats(self, webpage, video_id, m3u8_entry_protocol):          formats = [] -        m3u8_url = self._html_search_regex( -            r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False) -        if m3u8_url: -            formats.extend(self._extract_m3u8_formats( -                m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None)) - -        rtsp_url = self._search_regex( -            r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) -        if rtsp_url: -            formats.append({ -                'url': rtsp_url, -                'format_id': 'rtsp', -            }) - +        format_url_base = 'http' + self._html_search_regex( +            [ +                r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8', +                r'<a[^>]+href="rtsp(://[^"]+)"' +            ], webpage, 'format url') +        # TODO: Current DASH formats are broken - $Time$ pattern in +        # <SegmentTemplate> not implemented yet +        # formats.extend(self._extract_mpd_formats( +        #     format_url_base + '/manifest.mpd', +        #     video_id, mpd_id='dash', fatal=False)) +        formats.extend(self._extract_m3u8_formats( +            format_url_base + '/playlist.m3u8', video_id, 'mp4', +            m3u8_entry_protocol, m3u8_id='hls', fatal=False)) +        formats.extend(self._extract_f4m_formats( +            format_url_base + '/manifest.f4m', +            video_id, f4m_id='hds', fatal=False))          self._sort_formats(formats)          return formats  class DCNVideoIE(DCNBaseIE):      IE_NAME = 'dcn:video' -    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' -    
_TEST = { +    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' +    _TESTS = [{          'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375',          'info_dict':          { @@ -94,7 +96,10 @@ class DCNVideoIE(DCNBaseIE):              # m3u8 download              'skip_download': True,          }, -    } +    }, { +        'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -120,7 +125,7 @@ class DCNVideoIE(DCNBaseIE):  class DCNLiveIE(DCNBaseIE):      IE_NAME = 'dcn:live' -    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)'      def _real_extract(self, url):          channel_id = self._match_id(url) @@ -147,7 +152,7 @@ class DCNLiveIE(DCNBaseIE):  class DCNSeasonIE(InfoExtractor):      IE_NAME = 'dcn:season' -    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' +    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'      _TEST = {          'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A',          'info_dict': diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 113a4966f..12d28d3b9 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -51,6 +51,14 @@ class EaglePlatformIE(InfoExtractor):      }]      
@staticmethod +    def _extract_url(webpage): +        mobj = re.search( +            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', +            webpage) +        if mobj is not None: +            return mobj.group('url') + +    @staticmethod      def _handle_error(response):          status = int_or_none(response.get('status', 200))          if status != 200: diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b1b04f2fc..864c9af68 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -20,7 +20,10 @@ from .adobetv import (      AdobeTVVideoIE,  )  from .adultswim import AdultSwimIE -from .aenetworks import AENetworksIE +from .aenetworks import ( +    AENetworksIE, +    HistoryTopicIE, +)  from .afreecatv import AfreecaTVIE  from .aftonbladet import AftonbladetIE  from .airmozilla import AirMozillaIE @@ -136,9 +139,9 @@ from .chirbit import (      ChirbitProfileIE,  )  from .cinchcast import CinchcastIE -from .cliprs import ClipRsIE  from .clipfish import ClipfishIE  from .cliphunter import CliphunterIE +from .cliprs import ClipRsIE  from .clipsyndicate import ClipsyndicateIE  from .closertotruth import CloserToTruthIE  from .cloudy import CloudyIE @@ -168,6 +171,8 @@ from .crunchyroll import (  )  from .cspan import CSpanIE  from .ctsnews import CtsNewsIE +from .ctv import CTVIE +from .ctvnews import CTVNewsIE  from .cultureunplugged import CultureUnpluggedIE  from .cwtv import CWTVIE  from .dailymail import DailyMailIE @@ -251,6 +256,7 @@ from .fivemin import FiveMinIE  from .fivetv import FiveTVIE  from .fktv import FKTVIE  from .flickr import FlickrIE +from .flipagram import FlipagramIE  from .folketinget import FolketingetIE  from .footyroom import FootyRoomIE  from .formula1 import Formula1IE @@ -276,6 +282,7 @@ from .freespeech import FreespeechIE  from .freevideo import FreeVideoIE  from .funimation import FunimationIE  from 
.funnyordie import FunnyOrDieIE +from .fusion import FusionIE  from .gameinformer import GameInformerIE  from .gamekings import GamekingsIE  from .gameone import ( @@ -285,7 +292,6 @@ from .gameone import (  from .gamersyde import GamersydeIE  from .gamespot import GameSpotIE  from .gamestar import GameStarIE -from .gametrailers import GametrailersIE  from .gazeta import GazetaIE  from .gdcvault import GDCVaultIE  from .generic import GenericIE @@ -321,6 +327,10 @@ from .hotnewhiphop import HotNewHipHopIE  from .hotstar import HotStarIE  from .howcast import HowcastIE  from .howstuffworks import HowStuffWorksIE +from .hrti import ( +    HRTiIE, +    HRTiPlaylistIE, +)  from .huffpost import HuffPostIE  from .hypem import HypemIE  from .iconosquare import IconosquareIE @@ -359,6 +369,7 @@ from .jove import JoveIE  from .jwplatform import JWPlatformIE  from .jpopsukitv import JpopsukiIE  from .kaltura import KalturaIE +from .kamcord import KamcordIE  from .kanalplay import KanalPlayIE  from .kankan import KankanIE  from .karaoketv import KaraoketvIE @@ -423,6 +434,7 @@ from .makerschannel import MakersChannelIE  from .makertv import MakerTVIE  from .matchtv import MatchTVIE  from .mdr import MDRIE +from .meta import METAIE  from .metacafe import MetacafeIE  from .metacritic import MetacriticIE  from .mgoon import MgoonIE @@ -455,6 +467,7 @@ from .motherless import MotherlessIE  from .motorsport import MotorsportIE  from .movieclips import MovieClipsIE  from .moviezine import MoviezineIE +from .msn import MSNIE  from .mtv import (      MTVIE,      MTVServicesEmbeddedIE, @@ -481,7 +494,6 @@ from .nbc import (      NBCNewsIE,      NBCSportsIE,      NBCSportsVPlayerIE, -    MSNBCIE,  )  from .ndr import (      NDRIE, @@ -523,6 +535,7 @@ from .nick import (      NickDeIE,  )  from .niconico import NiconicoIE, NiconicoPlaylistIE +from .ninecninemedia import NineCNineMediaIE  from .ninegag import NineGagIE  from .noco import NocoIE  from .normalboots import NormalbootsIE @@ 
-570,6 +583,10 @@ from .nytimes import (  from .nuvid import NuvidIE  from .odnoklassniki import OdnoklassnikiIE  from .oktoberfesttv import OktoberfestTVIE +from .onet import ( +    OnetIE, +    OnetChannelIE, +)  from .onionstudios import OnionStudiosIE  from .ooyala import (      OoyalaIE, @@ -608,6 +625,7 @@ from .pluralsight import (      PluralsightCourseIE,  )  from .podomatic import PodomaticIE +from .polskieradio import PolskieRadioIE  from .porn91 import Porn91IE  from .pornhd import PornHdIE  from .pornhub import ( @@ -662,6 +680,7 @@ from .rice import RICEIE  from .ringtv import RingTVIE  from .ro220 import Ro220IE  from .rockstargames import RockstarGamesIE +from .roosterteeth import RoosterTeethIE  from .rottentomatoes import RottenTomatoesIE  from .roxwel import RoxwelIE  from .rtbf import RTBFIE @@ -706,10 +725,12 @@ from .shahid import ShahidIE  from .shared import SharedIE  from .sharesix import ShareSixIE  from .sina import SinaIE +from .sixplay import SixPlayIE  from .skynewsarabia import (      SkyNewsArabiaIE,      SkyNewsArabiaArticleIE,  ) +from .skysports import SkySportsIE  from .slideshare import SlideshareIE  from .slutload import SlutloadIE  from .smotri import ( @@ -891,6 +912,7 @@ from .udn import UDNEmbedIE  from .digiteka import DigitekaIE  from .unistra import UnistraIE  from .urort import UrortIE +from .urplay import URPlayIE  from .usatoday import USATodayIE  from .ustream import UstreamIE, UstreamChannelIE  from .ustudio import ( @@ -917,6 +939,7 @@ from .vice import (      ViceIE,      ViceShowIE,  ) +from .vidbit import VidbitIE  from .viddler import ViddlerIE  from .videodetective import VideoDetectiveIE  from .videofyme import VideofyMeIE @@ -1050,6 +1073,7 @@ from .youtube import (      YoutubeSearchDateIE,      YoutubeSearchIE,      YoutubeSearchURLIE, +    YoutubeSharedVideoIE,      YoutubeShowIE,      YoutubeSubscriptionsIE,      YoutubeTruncatedIDIE, diff --git a/youtube_dl/extractor/facebook.py 
b/youtube_dl/extractor/facebook.py index f5bbd39d2..cdb093262 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -129,6 +129,21 @@ class FacebookIE(InfoExtractor):          'only_matching': True,      }] +    @staticmethod +    def _extract_url(webpage): +        mobj = re.search( +            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) +        if mobj is not None: +            return mobj.group('url') + +        # Facebook API embed +        # see https://developers.facebook.com/docs/plugins/embedded-video-player +        mobj = re.search(r'''(?x)<div[^>]+ +                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ +                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage) +        if mobj is not None: +            return mobj.group('url') +      def _login(self):          (useremail, password) = self._get_login_info()          if useremail is None: @@ -204,12 +219,25 @@ class FacebookIE(InfoExtractor):          BEFORE = '{swf.addParam(param[0], param[1]);});'          AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' -        m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage) -        if m: -            swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"') +        PATTERN = re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER) + +        for m in re.findall(PATTERN, webpage): +            swf_params = m.replace('\\\\', '\\').replace('\\"', '"')              data = dict(json.loads(swf_params))              params_raw = compat_urllib_parse_unquote(data['params']) -            video_data = json.loads(params_raw)['video_data'] +            video_data_candidate = json.loads(params_raw)['video_data'] +            for _, f in video_data_candidate.items(): +                if not f: +                    continue +                if 
isinstance(f, dict): +                    f = [f] +                if not isinstance(f, list): +                    continue +                if f[0].get('video_id') == video_id: +                    video_data = video_data_candidate +                    break +            if video_data: +                break          def video_data_list2dict(video_data):              ret = {} @@ -239,6 +267,8 @@ class FacebookIE(InfoExtractor):          formats = []          for format_id, f in video_data.items(): +            if f and isinstance(f, dict): +                f = [f]              if not f or not isinstance(f, list):                  continue              for quality in ('sd', 'hd'): diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py new file mode 100644 index 000000000..acb6133ff --- /dev/null +++ b/youtube_dl/extractor/flipagram.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    int_or_none, +    float_or_none, +    try_get, +    unified_timestamp, +) + + +class FlipagramIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P<id>[^/?#&]+)' +    _TEST = { +        'url': 'https://flipagram.com/f/nyvTSJMKId', +        'md5': '888dcf08b7ea671381f00fab74692755', +        'info_dict': { +            'id': 'nyvTSJMKId', +            'ext': 'mp4', +            'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', +            'description': 'md5:d55e32edc55261cae96a41fa85ff630e', +            'duration': 35.571, +            'timestamp': 1461244995, +            'upload_date': '20160421', +            'uploader': 'kitty juria', +            'uploader_id': 'sjuria101', +            'creator': 'kitty juria', +            'view_count': int, +            'like_count': int, +            'repost_count': int, +            'comment_count': int, +            'comments': list, + 
           'formats': 'mincount:2', +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        video_data = self._parse_json( +            self._search_regex( +                r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'), +            video_id) + +        flipagram = video_data['flipagram'] +        video = flipagram['video'] + +        json_ld = self._search_json_ld(webpage, video_id, default=False) +        title = json_ld.get('title') or flipagram['captionText'] +        description = json_ld.get('description') or flipagram.get('captionText') + +        formats = [{ +            'url': video['url'], +            'width': int_or_none(video.get('width')), +            'height': int_or_none(video.get('height')), +            'filesize': int_or_none(video_data.get('size')), +        }] + +        preview_url = try_get( +            flipagram, lambda x: x['music']['track']['previewUrl'], compat_str) +        if preview_url: +            formats.append({ +                'url': preview_url, +                'ext': 'm4a', +                'vcodec': 'none', +            }) + +        self._sort_formats(formats) + +        counts = flipagram.get('counts', {}) +        user = flipagram.get('user', {}) +        video_data = flipagram.get('video', {}) + +        thumbnails = [{ +            'url': self._proto_relative_url(cover['url']), +            'width': int_or_none(cover.get('width')), +            'height': int_or_none(cover.get('height')), +            'filesize': int_or_none(cover.get('size')), +        } for cover in flipagram.get('covers', []) if cover.get('url')] + +        # Note that this only retrieves comments that are initally loaded. +        # For videos with large amounts of comments, most won't be retrieved. 
+        comments = [] +        for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []): +            text = comment.get('comment') +            if not text or not isinstance(text, list): +                continue +            comments.append({ +                'author': comment.get('user', {}).get('name'), +                'author_id': comment.get('user', {}).get('username'), +                'id': comment.get('id'), +                'text': text[0], +                'timestamp': unified_timestamp(comment.get('created')), +            }) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'duration': float_or_none(flipagram.get('duration'), 1000), +            'thumbnails': thumbnails, +            'timestamp': unified_timestamp(flipagram.get('iso8601Created')), +            'uploader': user.get('name'), +            'uploader_id': user.get('username'), +            'creator': user.get('name'), +            'view_count': int_or_none(counts.get('plays')), +            'like_count': int_or_none(counts.get('likes')), +            'repost_count': int_or_none(counts.get('reflips')), +            'comment_count': int_or_none(counts.get('comments')), +            'comments': comments, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index ad94e31f3..7653975e3 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -14,7 +14,10 @@ from ..utils import (      parse_duration,      determine_ext,  ) -from .dailymotion import DailymotionCloudIE +from .dailymotion import ( +    DailymotionIE, +    DailymotionCloudIE, +)  class FranceTVBaseInfoExtractor(InfoExtractor): @@ -188,6 +191,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):          'params': {              'skip_download': True,          }, +    }, { +        # Dailymotion embed +        'url': 
'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html', +        'md5': 'ee7f1828f25a648addc90cb2687b1f12', +        'info_dict': { +            'id': 'x4iiko0', +            'ext': 'mp4', +            'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen', +            'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016', +            'timestamp': 1467011958, +            'upload_date': '20160627', +            'uploader': 'France Inter', +            'uploader_id': 'x2q2ez', +        }, +        'add_ie': ['Dailymotion'],      }]      def _real_extract(self, url): @@ -197,7 +215,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):          dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)          if dmcloud_url: -            return self.url_result(dmcloud_url, 'DailymotionCloud') +            return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key()) + +        dailymotion_urls = DailymotionIE._extract_urls(webpage) +        if dailymotion_urls: +            return self.playlist_result([ +                self.url_result(dailymotion_url, DailymotionIE.ie_key()) +                for dailymotion_url in dailymotion_urls])          video_id, catalogue = self._search_regex(              (r'id-video=([^@]+@[^"]+)', diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py new file mode 100644 index 000000000..b4ab4cbb7 --- /dev/null +++ b/youtube_dl/extractor/fusion.py @@ -0,0 +1,35 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class FusionIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P<id>\d+)' +    _TESTS = [{ + 
       'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', +        'info_dict': { +            'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P', +            'ext': 'mp4', +            'title': 'U.S. and Panamanian forces work together to stop a vessel smuggling drugs', +            'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7', +            'duration': 140.0, +        }, +        'params': { +            'skip_download': True, +        }, +        'add_ie': ['Ooyala'], +    }, { +        'url': 'http://fusion.net/video/201781', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) + +        ooyala_code = self._search_regex( +            r'data-video-id=(["\'])(?P<code>.+?)\1', +            webpage, 'ooyala code', group='code') + +        return OoyalaIE._build_url_result(ooyala_code) diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py deleted file mode 100644 index 1e7948ab8..000000000 --- a/youtube_dl/extractor/gametrailers.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( -    int_or_none, -    parse_age_limit, -    url_basename, -) - - -class GametrailersIE(InfoExtractor): -    _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' - -    _TEST = { -        'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', -        'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a', -        'info_dict': { -            'id': '2983958', -            'ext': 'mp4', -            'display_id': '116437-Just-Cause-3-Review', -            'title': 'Just Cause 3 - Review', -            'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience 
than that?', -        }, -    } - -    def _real_extract(self, url): -        display_id = self._match_id(url) -        webpage = self._download_webpage(url, display_id) -        title = self._html_search_regex( -            r'<title>(.+?)\|', webpage, 'title').strip() -        embed_url = self._proto_relative_url( -            self._search_regex( -                r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage, -                'embed url'), -            scheme='http:') -        video_id = url_basename(embed_url) -        embed_page = self._download_webpage(embed_url, video_id) -        embed_vars_json = self._search_regex( -            r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page, -            'embed vars') -        info = self._parse_json(embed_vars_json, video_id) - -        formats = [] -        for media in info['media']: -            if media['mediaPurpose'] == 'play': -                formats.append({ -                    'url': media['uri'], -                    'height': media['height'], -                    'width:': media['width'], -                }) -        self._sort_formats(formats) - -        return { -            'id': video_id, -            'display_id': display_id, -            'title': title, -            'formats': formats, -            'thumbnail': info.get('thumbUri'), -            'description': self._og_search_description(webpage), -            'duration': int_or_none(info.get('videoLengthInSeconds')), -            'age_limit': parse_age_limit(info.get('audienceRating')), -        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4aa24061c..cddd1a817 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -49,7 +49,10 @@ from .pornhub import PornHubIE  from .xhamster import XHamsterEmbedIE  from .tnaflix import TNAFlixNetworkEmbedIE  from .vimeo import VimeoIE -from .dailymotion import DailymotionCloudIE +from .dailymotion import ( +    
DailymotionIE, +    DailymotionCloudIE, +)  from .onionstudios import OnionStudiosIE  from .viewlift import ViewLiftEmbedIE  from .screenwavemedia import ScreenwaveMediaIE @@ -64,6 +67,9 @@ from .liveleak import LiveLeakIE  from .threeqsdn import ThreeQSDNIE  from .theplatform import ThePlatformIE  from .vessel import VesselIE +from .kaltura import KalturaIE +from .eagleplatform import EaglePlatformIE +from .facebook import FacebookIE  class GenericIE(InfoExtractor): @@ -920,6 +926,24 @@ class GenericIE(InfoExtractor):              },              'add_ie': ['Kaltura'],          }, +        { +            # Kaltura embedded via quoted entry_id +            'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', +            'info_dict': { +                'id': '0_utuok90b', +                'ext': 'mp4', +                'title': '06_matthew_brender_raj_dutt', +                'timestamp': 1466638791, +                'upload_date': '20160622', +            }, +            'add_ie': ['Kaltura'], +            'expected_warnings': [ +                'Could not send HEAD request' +            ], +            'params': { +                'skip_download': True, +            } +        },          # Eagle.Platform embed (generic URL)          {              'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -1091,12 +1115,17 @@ class GenericIE(InfoExtractor):          # Dailymotion Cloud video          {              'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', -            'md5': '49444254273501a64675a7e68c502681', +            'md5': 'dcaf23ad0c67a256f4278bce6e0bae38',              'info_dict': { -                'id': '5585de919473990de4bee11b', +                'id': 'x2uy8t3',                  'ext': 'mp4', -                'title': 'Le débat', +                'title': 'Sauvons les abeilles ! 
- Le débat', +                'description': 'md5:d9082128b1c5277987825d684939ca26',                  'thumbnail': 're:^https?://.*\.jpe?g$', +                'timestamp': 1434970506, +                'upload_date': '20150622', +                'uploader': 'Public Sénat', +                'uploader_id': 'xa9gza',              }          },          # OnionStudios embed @@ -1220,6 +1249,102 @@ class GenericIE(InfoExtractor):                  'uploader': 'www.hudl.com',              },          }, +        # twitter:player embed +        { +            'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', +            'md5': 'a3e0df96369831de324f0778e126653c', +            'info_dict': { +                'id': '4909620399001', +                'ext': 'mp4', +                'title': 'What Do Black Holes Sound Like?', +                'description': 'what do black holes sound like', +                'upload_date': '20160524', +                'uploader_id': '29913724001', +                'timestamp': 1464107587, +                'uploader': 'TheAtlantic', +            }, +            'add_ie': ['BrightcoveLegacy'], +        }, +        # Facebook <iframe> embed +        { +            'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', +            'md5': 'fbcde74f534176ecb015849146dd3aee', +            'info_dict': { +                'id': '599637780109885', +                'ext': 'mp4', +                'title': 'Facebook video #599637780109885', +            }, +        }, +        # Facebook API embed +        { +            'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', +            'md5': 'a47372ee61b39a7b90287094d447d94e', +            'info_dict': { +                'id': '10153467542406923', +                'ext': 'mp4', +                'title': 'Facebook video #10153467542406923', +            }, +        }, +        # Wordpress "YouTube Video Importer" 
plugin +        { +            'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/', +            'md5': 'd16797741b560b485194eddda8121b48', +            'info_dict': { +                'id': 'HNTXWDXV9Is', +                'ext': 'mp4', +                'title': 'Blue Devils Drumline Stanford lot 2016', +                'upload_date': '20160627', +                'uploader_id': 'GENOCIDE8GENERAL10', +                'uploader': 'cylus cyrus', +            }, +        }, +        { +            # video stored on custom kaltura server +            'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv', +            'md5': '537617d06e64dfed891fa1593c4b30cc', +            'info_dict': { +                'id': '0_1iotm5bh', +                'ext': 'mp4', +                'title': 'Elecciones británicas: 5 lecciones para Rajoy', +                'description': 'md5:435a89d68b9760b92ce67ed227055f16', +                'uploader_id': 'videos.expansion@el-mundo.net', +                'upload_date': '20150429', +                'timestamp': 1430303472, +            }, +            'add_ie': ['Kaltura'], +        }, +        { +            # Non-standard Vimeo embed +            'url': 'https://openclassrooms.com/courses/understanding-the-web', +            'md5': '64d86f1c7d369afd9a78b38cbb88d80a', +            'info_dict': { +                'id': '148867247', +                'ext': 'mp4', +                'title': 'Understanding the web - Teaser', +                'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.', +                'upload_date': '20151214', +                'uploader': 'OpenClassrooms', +                'uploader_id': 'openclassrooms', +            }, +            'add_ie': ['Vimeo'], +        }, +        # { +        #     # TODO: find another test +        #     # http://schema.org/VideoObject +        #     'url': 
'https://flipagram.com/f/nyvTSJMKId', +        #     'md5': '888dcf08b7ea671381f00fab74692755', +        #     'info_dict': { +        #         'id': 'nyvTSJMKId', +        #         'ext': 'mp4', +        #         'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', +        #         'description': '#love for cats.', +        #         'timestamp': 1461244995, +        #         'upload_date': '20160421', +        #     }, +        #     'params': { +        #         'force_generic_extractor': True, +        #     }, +        # }      ]      def report_following_redirect(self, new_url): @@ -1576,12 +1701,16 @@ class GenericIE(InfoExtractor):          if matches:              return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) -        # Look for embedded Dailymotion player -        matches = re.findall( -            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) +        # Look for Wordpress "YouTube Video Importer" plugin +        matches = re.findall(r'''(?x)<div[^>]+ +            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ +            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)          if matches: -            return _playlist_from_matches( -                matches, lambda m: unescapeHTML(m[1])) +            return _playlist_from_matches(matches, lambda m: m[-1]) + +        matches = DailymotionIE._extract_urls(webpage) +        if matches: +            return _playlist_from_matches(matches)          # Look for embedded Dailymotion playlist player (#3822)          m = re.search( @@ -1718,10 +1847,9 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'))          # Look for embedded Facebook player -        mobj = re.search( -            
r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) -        if mobj is not None: -            return self.url_result(mobj.group('url'), 'Facebook') +        facebook_url = FacebookIE._extract_url(webpage) +        if facebook_url is not None: +            return self.url_result(facebook_url, 'Facebook')          # Look for embedded VK player          mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) @@ -1903,18 +2031,14 @@ class GenericIE(InfoExtractor):              return self.url_result(mobj.group('url'), 'Zapiks')          # Look for Kaltura embeds -        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or -                re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage)) -        if mobj is not None: -            return self.url_result(smuggle_url( -                'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), -                {'source_url': url}), 'Kaltura') +        kaltura_url = KalturaIE._extract_url(webpage) +        if kaltura_url: +            return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())          # Look for Eagle.Platform embeds -        mobj = re.search( -            r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) -        if mobj is not None: -            return self.url_result(mobj.group('url'), 'EaglePlatform') +        eagleplatform_url = EaglePlatformIE._extract_url(webpage) +        if eagleplatform_url: +            return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())          # Look for ClipYou (uses Eagle.Platform) embeds          
mobj = re.search( @@ -2060,6 +2184,24 @@ class GenericIE(InfoExtractor):                  'uploader': video_uploader,              } +        # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser +        embed_url = self._html_search_meta('twitter:player', webpage, default=None) +        if embed_url: +            return self.url_result(embed_url) + +        # Looking for http://schema.org/VideoObject +        json_ld = self._search_json_ld( +            webpage, video_id, default=None, expected_type='VideoObject') +        if json_ld and json_ld.get('url'): +            info_dict.update({ +                'title': video_title or info_dict['title'], +                'description': video_description, +                'thumbnail': video_thumbnail, +                'age_limit': age_limit +            }) +            info_dict.update(json_ld) +            return info_dict +          def check_video(vurl):              if YoutubeIE.suitable(vurl):                  return True diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py new file mode 100644 index 000000000..656ce6d05 --- /dev/null +++ b/youtube_dl/extractor/hrti.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( +    clean_html, +    ExtractorError, +    int_or_none, +    parse_age_limit, +    sanitized_Request, +    try_get, +) + + +class HRTiBaseIE(InfoExtractor): +    """ +        Base Information Extractor for Croatian Radiotelevision +        video on demand site https://hrti.hrt.hr +        Reverse engineered from the JavaScript app in app.min.js +    """ +    _NETRC_MACHINE = 'hrti' + +    _APP_LANGUAGE = 'hr' +    _APP_VERSION = '1.1' +    _APP_PUBLICATION_ID = 'all_in_one' +    _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json' + +    def _initialize_api(self): +        
init_data = { +            'application_publication_id': self._APP_PUBLICATION_ID +        } + +        uuid = self._download_json( +            self._API_URL, None, note='Downloading uuid', +            errnote='Unable to download uuid', +            data=json.dumps(init_data).encode('utf-8'))['uuid'] + +        app_data = { +            'uuid': uuid, +            'application_publication_id': self._APP_PUBLICATION_ID, +            'application_version': self._APP_VERSION +        } + +        req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8')) +        req.get_method = lambda: 'PUT' + +        resources = self._download_json( +            req, None, note='Downloading session information', +            errnote='Unable to download session information') + +        self._session_id = resources['session_id'] + +        modules = resources['modules'] + +        self._search_url = modules['vod_catalog']['resources']['search']['uri'].format( +            language=self._APP_LANGUAGE, +            application_id=self._APP_PUBLICATION_ID) + +        self._login_url = (modules['user']['resources']['login']['uri'] + +                           '/format/json').format(session_id=self._session_id) + +        self._logout_url = modules['user']['resources']['logout']['uri'] + +    def _login(self): +        (username, password) = self._get_login_info() +        # TODO: figure out authentication with cookies +        if username is None or password is None: +            self.raise_login_required() + +        auth_data = { +            'username': username, +            'password': password, +        } + +        try: +            auth_info = self._download_json( +                self._login_url, None, note='Logging in', errnote='Unable to log in', +                data=json.dumps(auth_data).encode('utf-8')) +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406: +                auth_info = 
self._parse_json(e.cause.read().encode('utf-8'), None) +            else: +                raise + +        error_message = auth_info.get('error', {}).get('message') +        if error_message: +            raise ExtractorError( +                '%s said: %s' % (self.IE_NAME, error_message), +                expected=True) + +        self._token = auth_info['secure_streaming_token'] + +    def _real_initialize(self): +        self._initialize_api() +        self._login() + + +class HRTiIE(HRTiBaseIE): +    _VALID_URL = r'''(?x) +                        (?: +                            hrti:(?P<short_id>[0-9]+)| +                            https?:// +                                hrti\.hrt\.hr/\#/video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)? +                        ) +                    ''' +    _TESTS = [{ +        'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd', +        'info_dict': { +            'id': '2181385', +            'display_id': 'republika-dokumentarna-serija-16-hd', +            'ext': 'mp4', +            'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)', +            'description': 'md5:48af85f620e8e0e1df4096270568544f', +            'duration': 2922, +            'view_count': int, +            'average_rating': int, +            'episode_number': int, +            'season_number': int, +            'age_limit': 12, +        }, +        'skip': 'Requires account credentials', +    }, { +        'url': 'https://hrti.hrt.hr/#/video/show/2181385/', +        'only_matching': True, +    }, { +        'url': 'hrti:2181385', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('short_id') or mobj.group('id') +        display_id = mobj.group('display_id') or video_id + +        video = self._download_json( +            '%s/video_id/%s/format/json' % (self._search_url, video_id), +            display_id, 
'Downloading video metadata JSON')['video'][0] + +        title_info = video['title'] +        title = title_info['title_long'] + +        movie = video['video_assets']['movie'][0] +        m3u8_url = movie['url'].format(TOKEN=self._token) +        formats = self._extract_m3u8_formats( +            m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', +            m3u8_id='hls') +        self._sort_formats(formats) + +        description = clean_html(title_info.get('summary_long')) +        age_limit = parse_age_limit(video.get('parental_control', {}).get('rating')) +        view_count = int_or_none(video.get('views')) +        average_rating = int_or_none(video.get('user_rating')) +        duration = int_or_none(movie.get('duration')) + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'duration': duration, +            'view_count': view_count, +            'average_rating': average_rating, +            'age_limit': age_limit, +            'formats': formats, +        } + + +class HRTiPlaylistIE(HRTiBaseIE): +    _VALID_URL = r'https?://hrti.hrt.hr/#/video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?' 
+    _TESTS = [{ +        'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena', +        'info_dict': { +            'id': '212', +            'title': 'ekumena', +        }, +        'playlist_mincount': 8, +        'skip': 'Requires account credentials', +    }, { +        'url': 'https://hrti.hrt.hr/#/video/list/category/212/', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        category_id = mobj.group('id') +        display_id = mobj.group('display_id') or category_id + +        response = self._download_json( +            '%s/category_id/%s/format/json' % (self._search_url, category_id), +            display_id, 'Downloading video metadata JSON') + +        video_ids = try_get( +            response, lambda x: x['video_listings'][0]['alternatives'][0]['list'], +            list) or [video['id'] for video in response.get('videos', []) if video.get('id')] + +        entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids] + +        return self.playlist_result(entries, category_id, display_id) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ddcb3c916..01c7b3042 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -3,28 +3,22 @@ from __future__ import unicode_literals  import hashlib  import itertools -import math -import os -import random  import re  import time -import uuid  from .common import InfoExtractor  from ..compat import ( -    compat_parse_qs,      compat_str,      compat_urllib_parse_urlencode, -    compat_urllib_parse_urlparse,  )  from ..utils import ( +    clean_html,      decode_packed_codes, +    get_element_by_id, +    get_element_by_attribute,      ExtractorError,      ohdave_rsa_encrypt,      remove_start, -    sanitized_Request, -    urlencode_postdata, -    url_basename,  ) @@ -171,70 +165,21 @@ class IqiyiIE(InfoExtractor):      _TESTS = [{          'url': 
'http://www.iqiyi.com/v_19rrojlavg.html', -        'md5': '2cb594dc2781e6c941a110d8f358118b', +        # MD5 checksum differs on my machine and Travis CI          'info_dict': {              'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', +            'ext': 'mp4',              'title': '美国德州空中惊现奇异云团 酷似UFO', -            'ext': 'f4v',          }      }, {          'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', +        'md5': '667171934041350c5de3f5015f7f1152',          'info_dict': {              'id': 'e3f585b550a280af23c98b6cb2be19fb', -            'title': '名侦探柯南第752集', -        }, -        'playlist': [{ -            'info_dict': { -                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', -                'ext': 'f4v', -                'title': '名侦探柯南第752集', -            }, -        }, { -            'info_dict': { -                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', -                'ext': 'f4v', -                'title': '名侦探柯南第752集', -            }, -        }, { -            'info_dict': { -                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', -                'ext': 'f4v', -                'title': '名侦探柯南第752集', -            }, -        }, { -            'info_dict': { -                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', -                'ext': 'f4v', -                'title': '名侦探柯南第752集', -            }, -        }, { -            'info_dict': { -                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5', -                'ext': 'f4v', -                'title': '名侦探柯南第752集', -            }, -        }, { -            'info_dict': { -                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6', -                'ext': 'f4v', -                'title': '名侦探柯南第752集', -            }, -        }, { -            'info_dict': { -                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7', -                'ext': 'f4v', -                'title': '名侦探柯南第752集', -            }, -        }, { -            'info_dict': { 
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8', -                'ext': 'f4v', -                'title': '名侦探柯南第752集', -            }, -        }], -        'params': { -            'skip_download': True, +            'ext': 'mp4', +            'title': '名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇',          }, +        'skip': 'Geo-restricted to China',      }, {          'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html',          'only_matching': True, @@ -250,22 +195,10 @@ class IqiyiIE(InfoExtractor):          'url': 'http://www.iqiyi.com/v_19rrny4w8w.html',          'info_dict': {              'id': 'f3cf468b39dddb30d676f89a91200dc1', +            'ext': 'mp4',              'title': '泰坦尼克号',          }, -        'playlist': [{ -            'info_dict': { -                'id': 'f3cf468b39dddb30d676f89a91200dc1_part1', -                'ext': 'f4v', -                'title': '泰坦尼克号', -            }, -        }, { -            'info_dict': { -                'id': 'f3cf468b39dddb30d676f89a91200dc1_part2', -                'ext': 'f4v', -                'title': '泰坦尼克号', -            }, -        }], -        'expected_warnings': ['Needs a VIP account for full video'], +        'skip': 'Geo-restricted to China',      }, {          'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',          'info_dict': { @@ -278,20 +211,15 @@ class IqiyiIE(InfoExtractor):          'only_matching': True,      }] -    _FORMATS_MAP = [ -        ('1', 'h6'), -        ('2', 'h5'), -        ('3', 'h4'), -        ('4', 'h3'), -        ('5', 'h2'), -        ('10', 'h1'), -    ] - -    AUTH_API_ERRORS = { -        # No preview available (不允许试看鉴权失败) -        'Q00505': 'This video requires a VIP account', -        # End of preview time (试看结束鉴权失败) -        'Q00506': 'Needs a VIP account for full video', +    _FORMATS_MAP = { +        '96': 1,    # 216p, 240p +        '1': 2,     # 336p, 360p +        '2': 3,     # 480p, 504p +        '21': 4,    # 504p +        '4': 5,     # 720p +        '17': 
5,    # 720p +        '5': 6,     # 1072p, 1080p +        '18': 7,    # 1080p      }      def _real_initialize(self): @@ -352,177 +280,23 @@ class IqiyiIE(InfoExtractor):          return True -    def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning): -        auth_params = { -            # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as -            'version': '2.0', -            'platform': 'b6c13e26323c537d', -            'aid': tvid, -            'tvid': tvid, -            'uid': '', -            'deviceId': _uuid, -            'playType': 'main',  # XXX: always main? -            'filename': os.path.splitext(url_basename(api_video_url))[0], -        } +    def get_raw_data(self, tvid, video_id): +        tm = int(time.time() * 1000) -        qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query) -        for key, val in qd_items.items(): -            auth_params[key] = val[0] - -        auth_req = sanitized_Request( -            'http://api.vip.iqiyi.com/services/ckn.action', -            urlencode_postdata(auth_params)) -        # iQiyi server throws HTTP 405 error without the following header -        auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded') -        auth_result = self._download_json( -            auth_req, video_id, -            note='Downloading video authentication JSON', -            errnote='Unable to download video authentication JSON') - -        code = auth_result.get('code') -        msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code -        if code == 'Q00506': -            if do_report_warning: -                self.report_warning(msg) -            return False -        if 'data' not in auth_result: -            if msg is not None: -                raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True) -            raise ExtractorError('Unexpected error from Iqiyi auth API') 
- -        return auth_result['data'] - -    def construct_video_urls(self, data, video_id, _uuid, tvid): -        def do_xor(x, y): -            a = y % 3 -            if a == 1: -                return x ^ 121 -            if a == 2: -                return x ^ 72 -            return x ^ 103 - -        def get_encode_code(l): -            a = 0 -            b = l.split('-') -            c = len(b) -            s = '' -            for i in range(c - 1, -1, -1): -                a = do_xor(int(b[c - i - 1], 16), i) -                s += chr(a) -            return s[::-1] - -        def get_path_key(x, format_id, segment_index): -            mg = ')(*&^flash@#$%a' -            tm = self._download_json( -                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id, -                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id) -            )['t'] -            t = str(int(math.floor(int(tm) / (600.0)))) -            return md5_text(t + mg + x) - -        video_urls_dict = {} -        need_vip_warning_report = True -        for format_item in data['vp']['tkl'][0]['vs']: -            if 0 < int(format_item['bid']) <= 10: -                format_id = self.get_format(format_item['bid']) -            else: -                continue - -            video_urls = [] - -            video_urls_info = format_item['fs'] -            if not format_item['fs'][0]['l'].startswith('/'): -                t = get_encode_code(format_item['fs'][0]['l']) -                if t.endswith('mp4'): -                    video_urls_info = format_item['flvs'] - -            for segment_index, segment in enumerate(video_urls_info): -                vl = segment['l'] -                if not vl.startswith('/'): -                    vl = get_encode_code(vl) -                is_vip_video = '/vip/' in vl -                filesize = segment['b'] -                base_url = data['vp']['du'].split('/') -                if not is_vip_video: -      
              key = get_path_key( -                        vl.split('/')[-1].split('.')[0], format_id, segment_index) -                    base_url.insert(-1, key) -                base_url = '/'.join(base_url) -                param = { -                    'su': _uuid, -                    'qyid': uuid.uuid4().hex, -                    'client': '', -                    'z': '', -                    'bt': '', -                    'ct': '', -                    'tn': str(int(time.time())) -                } -                api_video_url = base_url + vl -                if is_vip_video: -                    api_video_url = api_video_url.replace('.f4v', '.hml') -                    auth_result = self._authenticate_vip_video( -                        api_video_url, video_id, tvid, _uuid, need_vip_warning_report) -                    if auth_result is False: -                        need_vip_warning_report = False -                        break -                    param.update({ -                        't': auth_result['t'], -                        # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as -                        'cid': 'afbe8fd3d73448c9', -                        'vid': video_id, -                        'QY00001': auth_result['u'], -                    }) -                api_video_url += '?' if '?' 
not in api_video_url else '&' -                api_video_url += compat_urllib_parse_urlencode(param) -                js = self._download_json( -                    api_video_url, video_id, -                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) -                video_url = js['l'] -                video_urls.append( -                    (video_url, filesize)) - -            video_urls_dict[format_id] = video_urls -        return video_urls_dict - -    def get_format(self, bid): -        matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)] -        return matched_format_ids[0] if len(matched_format_ids) else None - -    def get_bid(self, format_id): -        matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id] -        return matched_bids[0] if len(matched_bids) else None - -    def get_raw_data(self, tvid, video_id, enc_key, _uuid): -        tm = str(int(time.time())) -        tail = tm + tvid -        param = { -            'key': 'fvip', -            'src': md5_text('youtube-dl'), -            'tvId': tvid, +        key = 'd5fb4bd9d50c4be6948c97edd7254b0e' +        sc = md5_text(compat_str(tm) + key + tvid) +        params = { +            'tvid': tvid,              'vid': video_id, -            'vinfo': 1, -            'tm': tm, -            'enc': md5_text(enc_key + tail), -            'qyid': _uuid, -            'tn': random.random(), -            # In iQiyi's flash player, um is set to 1 if there's a logged user -            # Some 1080P formats are only available with a logged user. -            # Here force um=1 to trick the iQiyi server -            'um': 1, -            'authkey': md5_text(md5_text('') + tail), -            'k_tag': 1, +            'src': '76f90cbd92f94a2e925d83e8ccd22cb7', +            'sc': sc, +            't': tm,          } -        api_url = 'http://cache.video.qiyi.com/vms' + '?' 
+ \ -            compat_urllib_parse_urlencode(param) -        raw_data = self._download_json(api_url, video_id) -        return raw_data - -    def get_enc_key(self, video_id): -        # TODO: automatic key extraction -        # last update at 2016-01-22 for Zombie::bite -        enc_key = '4a1caba4b4465345366f28da7c117d20' -        return enc_key +        return self._download_json( +            'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id), +            video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='), +            query=params, headers=self.geo_verification_headers())      def _extract_playlist(self, webpage):          PAGE_SIZE = 50 @@ -571,58 +345,41 @@ class IqiyiIE(InfoExtractor):              r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')          video_id = self._search_regex(              r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') -        _uuid = uuid.uuid4().hex - -        enc_key = self.get_enc_key(video_id) - -        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) - -        if raw_data['code'] != 'A000000': -            raise ExtractorError('Unable to load data. 
Error code: ' + raw_data['code']) - -        data = raw_data['data'] - -        title = data['vi']['vn'] - -        # generate video_urls_dict -        video_urls_dict = self.construct_video_urls( -            data, video_id, _uuid, tvid) - -        # construct info -        entries = [] -        for format_id in video_urls_dict: -            video_urls = video_urls_dict[format_id] -            for i, video_url_info in enumerate(video_urls): -                if len(entries) < i + 1: -                    entries.append({'formats': []}) -                entries[i]['formats'].append( -                    { -                        'url': video_url_info[0], -                        'filesize': video_url_info[-1], -                        'format_id': format_id, -                        'preference': int(self.get_bid(format_id)) -                    } -                ) - -        for i in range(len(entries)): -            self._sort_formats(entries[i]['formats']) -            entries[i].update( -                { -                    'id': '%s_part%d' % (video_id, i + 1), -                    'title': title, -                } -            ) - -        if len(entries) > 1: -            info = { -                '_type': 'multi_video', -                'id': video_id, -                'title': title, -                'entries': entries, -            } -        else: -            info = entries[0] -            info['id'] = video_id -            info['title'] = title - -        return info + +        formats = [] +        for _ in range(5): +            raw_data = self.get_raw_data(tvid, video_id) + +            if raw_data['code'] != 'A00000': +                if raw_data['code'] == 'A00111': +                    self.raise_geo_restricted() +                raise ExtractorError('Unable to load data. 
Error code: ' + raw_data['code']) + +            data = raw_data['data'] + +            for stream in data['vidl']: +                if 'm3utx' not in stream: +                    continue +                vd = compat_str(stream['vd']) +                formats.append({ +                    'url': stream['m3utx'], +                    'format_id': vd, +                    'ext': 'mp4', +                    'preference': self._FORMATS_MAP.get(vd, -1), +                    'protocol': 'm3u8_native', +                }) + +            if formats: +                break + +            self._sleep(5, video_id) + +        self._sort_formats(formats) +        title = (get_element_by_id('widget-videotitle', webpage) or +                 clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage))) + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index a65697ff5..1729f5bfb 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -6,7 +6,6 @@ import base64  from .common import InfoExtractor  from ..compat import ( -    compat_urllib_parse_urlencode,      compat_urlparse,      compat_parse_qs,  ) @@ -15,6 +14,7 @@ from ..utils import (      ExtractorError,      int_or_none,      unsmuggle_url, +    smuggle_url,  ) @@ -34,7 +34,8 @@ class KalturaIE(InfoExtractor):                          )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?                  )                  ''' -    _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?' 
+    _SERVICE_URL = 'http://cdnapi.kaltura.com' +    _SERVICE_BASE = '/api_v3/index.php'      _TESTS = [          {              'url': 'kaltura:269692:1_1jc2y3e4', @@ -64,16 +65,50 @@ class KalturaIE(InfoExtractor):          }      ] -    def _kaltura_api_call(self, video_id, actions, *args, **kwargs): +    @staticmethod +    def _extract_url(webpage): +        mobj = ( +            re.search( +                r"""(?xs) +                    kWidget\.(?:thumb)?[Ee]mbed\( +                    \{.*? +                        (?P<q1>['\"])wid(?P=q1)\s*:\s* +                        (?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*? +                        (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s* +                        (?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4), +                """, webpage) or +            re.search( +                r'''(?xs) +                    (?P<q1>["\']) +                        (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*? +                    (?P=q1).*? 
+                    (?: +                        entry_?[Ii]d| +                        (?P<q2>["\'])entry_?[Ii]d(?P=q2) +                    )\s*:\s* +                    (?P<q3>["\'])(?P<id>.+?)(?P=q3) +                ''', webpage)) +        if mobj: +            embed_info = mobj.groupdict() +            url = 'kaltura:%(partner_id)s:%(id)s' % embed_info +            escaped_pid = re.escape(embed_info['partner_id']) +            service_url = re.search( +                r'<script[^>]+src=["\']((?:https?:)?//.+?)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), +                webpage) +            if service_url: +                url = smuggle_url(url, {'service_url': service_url.group(1)}) +            return url + +    def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs):          params = actions[0]          if len(actions) > 1:              for i, a in enumerate(actions[1:], start=1):                  for k, v in a.items():                      params['%d:%s' % (i, k)] = v -        query = compat_urllib_parse_urlencode(params) -        url = self._API_BASE + query -        data = self._download_json(url, video_id, *args, **kwargs) +        data = self._download_json( +            (service_url or self._SERVICE_URL) + self._SERVICE_BASE, +            video_id, query=params, *args, **kwargs)          status = data if len(actions) == 1 else data[0]          if status.get('objectType') == 'KalturaAPIException': @@ -82,7 +117,7 @@ class KalturaIE(InfoExtractor):          return data -    def _get_kaltura_signature(self, video_id, partner_id): +    def _get_kaltura_signature(self, video_id, partner_id, service_url=None):          actions = [{              'apiVersion': '3.1',              'expiry': 86400, @@ -92,10 +127,10 @@ class KalturaIE(InfoExtractor):              'widgetId': '_%s' % partner_id,          }]          return self._kaltura_api_call( -            video_id, actions, note='Downloading Kaltura 
signature')['ks'] +            video_id, actions, service_url, note='Downloading Kaltura signature')['ks'] -    def _get_video_info(self, video_id, partner_id): -        signature = self._get_kaltura_signature(video_id, partner_id) +    def _get_video_info(self, video_id, partner_id, service_url=None): +        signature = self._get_kaltura_signature(video_id, partner_id, service_url)          actions = [              {                  'action': 'null', @@ -118,7 +153,7 @@ class KalturaIE(InfoExtractor):              },          ]          return self._kaltura_api_call( -            video_id, actions, note='Downloading video info JSON') +            video_id, actions, service_url, note='Downloading video info JSON')      def _real_extract(self, url):          url, smuggled_data = unsmuggle_url(url, {}) @@ -127,7 +162,7 @@ class KalturaIE(InfoExtractor):          partner_id, entry_id = mobj.group('partner_id', 'id')          ks = None          if partner_id and entry_id: -            info, flavor_assets = self._get_video_info(entry_id, partner_id) +            info, flavor_assets = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url'))          else:              path, query = mobj.group('path', 'query')              if not path and not query: @@ -175,12 +210,17 @@ class KalturaIE(InfoExtractor):                  unsigned_url += '?referrer=%s' % referrer              return unsigned_url +        data_url = info['dataUrl'] +        if '/flvclipper/' in data_url: +            data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url) +          formats = []          for f in flavor_assets:              # Continue if asset is not ready              if f['status'] != 2:                  continue -            video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id'])) +            video_url = sign_url( +                '%s/flavorId/%s' % (data_url, f['id']))              formats.append({                  'format_id': 
'%(fileExt)s-%(bitrate)s' % f,                  'ext': f.get('fileExt'), @@ -193,9 +233,12 @@ class KalturaIE(InfoExtractor):                  'width': int_or_none(f.get('width')),                  'url': video_url,              }) -        m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp')) -        formats.extend(self._extract_m3u8_formats( -            m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) +        if '/playManifest/' in data_url: +            m3u8_url = sign_url(data_url.replace( +                'format/url', 'format/applehttp')) +            formats.extend(self._extract_m3u8_formats( +                m3u8_url, entry_id, 'mp4', 'm3u8_native', +                m3u8_id='hls', fatal=False))          self._check_formats(formats, entry_id)          self._sort_formats(formats) diff --git a/youtube_dl/extractor/kamcord.py b/youtube_dl/extractor/kamcord.py new file mode 100644 index 000000000..b50120d98 --- /dev/null +++ b/youtube_dl/extractor/kamcord.py @@ -0,0 +1,71 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    int_or_none, +    qualities, +) + + +class KamcordIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?kamcord\.com/v/(?P<id>[^/?#&]+)' +    _TEST = { +        'url': 'https://www.kamcord.com/v/hNYRduDgWb4', +        'md5': 'c3180e8a9cfac2e86e1b88cb8751b54c', +        'info_dict': { +            'id': 'hNYRduDgWb4', +            'ext': 'mp4', +            'title': 'Drinking Madness', +            'uploader': 'jacksfilms', +            'uploader_id': '3044562', +            'view_count': int, +            'like_count': int, +            'comment_count': int, +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        video = self._parse_json( +            self._search_regex( +                
r'window\.__props\s*=\s*({.+?});?(?:\n|\s*</script)', +                webpage, 'video'), +            video_id)['video'] + +        title = video['title'] + +        formats = self._extract_m3u8_formats( +            video['play']['hls'], video_id, 'mp4', entry_protocol='m3u8_native') +        self._sort_formats(formats) + +        uploader = video.get('user', {}).get('username') +        uploader_id = video.get('user', {}).get('id') + +        view_count = int_or_none(video.get('viewCount')) +        like_count = int_or_none(video.get('heartCount')) +        comment_count = int_or_none(video.get('messageCount')) + +        preference_key = qualities(('small', 'medium', 'large')) + +        thumbnails = [{ +            'url': thumbnail_url, +            'id': thumbnail_id, +            'preference': preference_key(thumbnail_id), +        } for thumbnail_id, thumbnail_url in (video.get('thumbnail') or {}).items() +            if isinstance(thumbnail_id, compat_str) and isinstance(thumbnail_url, compat_str)] + +        return { +            'id': video_id, +            'title': title, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'view_count': view_count, +            'like_count': like_count, +            'comment_count': comment_count, +            'thumbnails': thumbnails, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 0221fb919..b1d460599 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -26,11 +26,6 @@ class KuwoBaseIE(InfoExtractor):      def _get_formats(self, song_id, tolerate_ip_deny=False):          formats = []          for file_format in self._FORMATS: -            headers = {} -            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') -            if cn_verification_proxy: -                headers['Ytdl-request-proxy'] = cn_verification_proxy -              query = {           
       'format': file_format['ext'],                  'br': file_format.get('br', ''), @@ -42,7 +37,7 @@ class KuwoBaseIE(InfoExtractor):              song_url = self._download_webpage(                  'http://antiserver.kuwo.cn/anti.s',                  song_id, note='Download %s url info' % file_format['format'], -                query=query, headers=headers, +                query=query, headers=self.geo_verification_headers(),              )              if song_url == 'IPDeny' and not tolerate_ip_deny: diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index b08f6e3c9..da5a5de4a 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -1,60 +1,65 @@ +# coding: utf-8  from __future__ import unicode_literals  from .common import InfoExtractor  from ..utils import ( -    parse_duration, +    js_to_json, +    smuggle_url,  )  class LA7IE(InfoExtractor): -    IE_NAME = 'la7.tv' -    _VALID_URL = r'''(?x) -        https?://(?:www\.)?la7\.tv/ -        (?: -            richplayer/\?assetid=| -            \?contentId= -        ) -        (?P<id>[0-9]+)''' - -    _TEST = { -        'url': 'http://www.la7.tv/richplayer/?assetid=50355319', -        'md5': 'ec7d1f0224d20ba293ab56cf2259651f', +    IE_NAME = 'la7.it' +    _VALID_URL = r'''(?x)(https?://)?(?: +        (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/| +        tg\.la7\.it/repliche-tgla7\?id= +    )(?P<id>.+)''' + +    _TESTS = [{ +        # 'src' is a plain URL +        'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', +        'md5': '8b613ffc0c4bf9b9e377169fc19c214c',          'info_dict': { -            'id': '50355319', +            'id': 'inccool8-02-10-2015-163722',              'ext': 'mp4', -            'title': 'IL DIVO', -            'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti  e Flavio Bucci', -            'duration': 6254, +            'title': 'Inc.Cool8', +            'description': 
'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto  atletico', +            'thumbnail': 're:^https?://.*', +            'uploader_id': 'kdla7pillole@iltrovatore.it', +            'timestamp': 1443814869, +            'upload_date': '20151002',          }, -        'skip': 'Blocked in the US', -    } +    }, { +        # 'src' is a dictionary +        'url': 'http://tg.la7.it/repliche-tgla7?id=189080', +        'md5': '6b0d8888d286e39870208dfeceaf456b', +        'info_dict': { +            'id': '189080', +            'ext': 'mp4', +            'title': 'TG LA7', +        }, +    }, { +        'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) -        xml_url = 'http://www.la7.tv/repliche/content/index.php?contentId=%s' % video_id -        doc = self._download_xml(xml_url, video_id) - -        video_title = doc.find('title').text -        description = doc.find('description').text -        duration = parse_duration(doc.find('duration').text) -        thumbnail = doc.find('img').text -        view_count = int(doc.find('views').text) -        prefix = doc.find('.//fqdn').text.strip().replace('auto:', 'http:') +        webpage = self._download_webpage(url, video_id) -        formats = [{ -            'format': vnode.find('quality').text, -            'tbr': int(vnode.find('quality').text), -            'url': vnode.find('fms').text.strip().replace('mp4:', prefix), -        } for vnode in doc.findall('.//videos/video')] -        self._sort_formats(formats) +        player_data = self._parse_json( +            self._search_regex(r'videoLa7\(({[^;]+})\);', webpage, 'player data'), +            video_id, transform_source=js_to_json)          return { +            '_type': 'url_transparent', +            'url': smuggle_url('kaltura:103:%s' % 
player_data['vid'], { +                'service_url': 'http://kdam.iltrovatore.it', +            }),              'id': video_id, -            'title': video_title, -            'description': description, -            'thumbnail': thumbnail, -            'duration': duration, -            'formats': formats, -            'view_count': view_count, +            'title': player_data['title'], +            'description': self._og_search_description(webpage, default=None), +            'thumbnail': player_data.get('poster'), +            'ie_key': 'Kaltura',          } diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 63f581cd9..e9cc9aa59 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -20,9 +20,10 @@ from ..utils import (      int_or_none,      orderedSet,      parse_iso8601, -    sanitized_Request,      str_or_none,      url_basename, +    urshift, +    update_url_query,  ) @@ -74,15 +75,11 @@ class LeIE(InfoExtractor):          'only_matching': True,      }] -    @staticmethod -    def urshift(val, n): -        return val >> n if val >= 0 else (val + 0x100000000) >> n -      # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf      def ror(self, param1, param2):          _loc3_ = 0          while _loc3_ < param2: -            param1 = self.urshift(param1, 1) + ((param1 & 1) << 31) +            param1 = urshift(param1, 1) + ((param1 & 1) << 31)              _loc3_ += 1          return param1 @@ -93,6 +90,10 @@ class LeIE(InfoExtractor):          _loc3_ = self.ror(_loc3_, _loc2_ % 17)          return _loc3_ +    # reversed from http://jstatic.letvcdn.com/sdk/player.js +    def get_mms_key(self, time): +        return self.ror(time, 8) ^ 185025305 +      # see M3U8Encryption class in KLetvPlayer.swf      @staticmethod      def decrypt_m3u8(encrypted_data): @@ -113,28 +114,7 @@ class LeIE(InfoExtractor):          return bytes(_loc7_) -    def _real_extract(self, url): -      
  media_id = self._match_id(url) -        page = self._download_webpage(url, media_id) -        params = { -            'id': media_id, -            'platid': 1, -            'splatid': 101, -            'format': 1, -            'tkey': self.calc_time_key(int(time.time())), -            'domain': 'www.le.com' -        } -        play_json_req = sanitized_Request( -            'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse_urlencode(params) -        ) -        cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') -        if cn_verification_proxy: -            play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy) - -        play_json = self._download_json( -            play_json_req, -            media_id, 'Downloading playJson data') - +    def _check_errors(self, play_json):          # Check for errors          playstatus = play_json['playstatus']          if playstatus['status'] == 0: @@ -145,43 +125,99 @@ class LeIE(InfoExtractor):                  msg = 'Generic error. 
flag = %d' % flag              raise ExtractorError(msg, expected=True) -        playurl = play_json['playurl'] - -        formats = ['350', '1000', '1300', '720p', '1080p'] -        dispatch = playurl['dispatch'] +    def _real_extract(self, url): +        media_id = self._match_id(url) +        page = self._download_webpage(url, media_id) -        urls = [] -        for format_id in formats: -            if format_id in dispatch: -                media_url = playurl['domain'][0] + dispatch[format_id][0] -                media_url += '&' + compat_urllib_parse_urlencode({ -                    'm3v': 1, +        play_json_h5 = self._download_json( +            'http://api.le.com/mms/out/video/playJsonH5', +            media_id, 'Downloading html5 playJson data', query={ +                'id': media_id, +                'platid': 3, +                'splatid': 304, +                'format': 1, +                'tkey': self.get_mms_key(int(time.time())), +                'domain': 'www.le.com', +                'tss': 'no', +            }, +            headers=self.geo_verification_headers()) +        self._check_errors(play_json_h5) + +        play_json_flash = self._download_json( +            'http://api.le.com/mms/out/video/playJson', +            media_id, 'Downloading flash playJson data', query={ +                'id': media_id, +                'platid': 1, +                'splatid': 101, +                'format': 1, +                'tkey': self.calc_time_key(int(time.time())), +                'domain': 'www.le.com', +            }, +            headers=self.geo_verification_headers()) +        self._check_errors(play_json_flash) + +        def get_h5_urls(media_url, format_id): +            location = self._download_json( +                media_url, media_id, +                'Download JSON metadata for format %s' % format_id, query={                      'format': 1,                      'expect': 3, -                    'rateid': format_id, -           
     }) +                    'tss': 'no', +                })['location'] + +            return { +                'http': update_url_query(location, {'tss': 'no'}), +                'hls': update_url_query(location, {'tss': 'ios'}), +            } -                nodes_data = self._download_json( -                    media_url, media_id, -                    'Download JSON metadata for format %s' % format_id) +        def get_flash_urls(media_url, format_id): +            media_url += '&' + compat_urllib_parse_urlencode({ +                'm3v': 1, +                'format': 1, +                'expect': 3, +                'rateid': format_id, +            }) -                req = self._request_webpage( -                    nodes_data['nodelist'][0]['location'], media_id, -                    note='Downloading m3u8 information for format %s' % format_id) +            nodes_data = self._download_json( +                media_url, media_id, +                'Download JSON metadata for format %s' % format_id) -                m3u8_data = self.decrypt_m3u8(req.read()) +            req = self._request_webpage( +                nodes_data['nodelist'][0]['location'], media_id, +                note='Downloading m3u8 information for format %s' % format_id) -                url_info_dict = { -                    'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), -                    'ext': determine_ext(dispatch[format_id][1]), -                    'format_id': format_id, -                    'protocol': 'm3u8', -                } +            m3u8_data = self.decrypt_m3u8(req.read()) -                if format_id[-1:] == 'p': -                    url_info_dict['height'] = int_or_none(format_id[:-1]) +            return { +                'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), +            } -                urls.append(url_info_dict) +        extracted_formats = [] +        formats = [] +        for play_json, get_urls in 
((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)): +            playurl = play_json['playurl'] +            play_domain = playurl['domain'][0] + +            for format_id, format_data in playurl.get('dispatch', []).items(): +                if format_id in extracted_formats: +                    continue +                extracted_formats.append(format_id) + +                media_url = play_domain + format_data[0] +                for protocol, format_url in get_urls(media_url, format_id).items(): +                    f = { +                        'url': format_url, +                        'ext': determine_ext(format_data[1]), +                        'format_id': '%s-%s' % (protocol, format_id), +                        'protocol': 'm3u8_native' if protocol == 'hls' else 'http', +                        'quality': int_or_none(format_id), +                    } + +                    if format_id[-1:] == 'p': +                        f['height'] = int_or_none(format_id[:-1]) + +                    formats.append(f) +        self._sort_formats(formats, ('height', 'quality', 'format_id'))          publish_time = parse_iso8601(self._html_search_regex(              r'发布时间 ([^<>]+) ', page, 'publish time', default=None), @@ -190,7 +226,7 @@ class LeIE(InfoExtractor):          return {              'id': media_id, -            'formats': urls, +            'formats': formats,              'title': playurl['title'],              'thumbnail': playurl['pic'],              'description': description, diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 2d5040032..a98c4c530 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -100,7 +100,7 @@ class LyndaIE(LyndaBaseIE):      _TESTS = [{          'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', -        'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', +        # md5 is unstable          'info_dict': {              
'id': '114408',              'ext': 'mp4', diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py index d5945ad66..39d2742c8 100644 --- a/youtube_dl/extractor/m6.py +++ b/youtube_dl/extractor/m6.py @@ -1,8 +1,6 @@  # encoding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor @@ -23,34 +21,5 @@ class M6IE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') - -        rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id, -                                 'Downloading video RSS') - -        title = rss.find('./channel/item/title').text -        description = rss.find('./channel/item/description').text -        thumbnail = rss.find('./channel/item/visuel_clip_big').text -        duration = int(rss.find('./channel/item/duration').text) -        view_count = int(rss.find('./channel/item/nombre_vues').text) - -        formats = [] -        for format_id in ['lq', 'sd', 'hq', 'hd']: -            video_url = rss.find('./channel/item/url_video_%s' % format_id) -            if video_url is None: -                continue -            formats.append({ -                'url': video_url.text, -                'format_id': format_id, -            }) - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'duration': duration, -            'view_count': view_count, -            'formats': formats, -        } +        video_id = self._match_id(url) +        return self.url_result('6play:%s' % video_id, 'SixPlay', video_id) diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py new file mode 100644 index 000000000..cdb46e163 --- /dev/null +++ b/youtube_dl/extractor/meta.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import 
InfoExtractor +from .pladform import PladformIE +from ..utils import ( +    unescapeHTML, +    int_or_none, +    ExtractorError, +) + + +class METAIE(InfoExtractor): +    _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)' +    _TESTS = [{ +        'url': 'http://video.meta.ua/5502115.video', +        'md5': '71b6f3ee274bef16f1ab410f7f56b476', +        'info_dict': { +            'id': '5502115', +            'ext': 'mp4', +            'title': 'Sony Xperia Z camera test [HQ]', +            'description': 'Xperia Z shoots video in FullHD HDR.', +            'uploader_id': 'nomobile', +            'uploader': 'CHЁZA.TV', +            'upload_date': '20130211', +        }, +        'add_ie': ['Youtube'], +    }, { +        'url': 'http://video.meta.ua/iframe/5502115', +        'only_matching': True, +    }, { +        # pladform embed +        'url': 'http://video.meta.ua/7121015.video', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        st_html5 = self._search_regex( +            r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None) + +        if st_html5: +            # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js +            json_str = '' +            for i in range(0, len(st_html5), 3): +                json_str += '&#x0%s;' % st_html5[i:i + 3] +            uppod_data = self._parse_json(unescapeHTML(json_str), video_id) +            error = uppod_data.get('customnotfound') +            if error: +                raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + +            video_url = uppod_data['file'] +            info = { +                'id': video_id, +                'url': video_url, +                'title': uppod_data.get('comment') or self._og_search_title(webpage), +                'description': 
self._og_search_description(webpage, default=None), +                'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage), +                'duration': int_or_none(self._og_search_property( +                    'video:duration', webpage, default=None)), +            } +            if 'youtube.com/' in video_url: +                info.update({ +                    '_type': 'url_transparent', +                    'ie_key': 'Youtube', +                }) +            return info + +        pladform_url = PladformIE._extract_url(webpage) +        if pladform_url: +            return self.url_result(pladform_url) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index b6f00cc25..e6e7659a1 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -11,13 +11,14 @@ from ..utils import (      determine_ext,      ExtractorError,      int_or_none, -    sanitized_Request,      urlencode_postdata, +    get_element_by_attribute, +    mimetype2ext,  )  class MetacafeIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' +    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'      _DISCLAIMER = 'http://www.metacafe.com/family_filter/'      _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'      IE_NAME = 'metacafe' @@ -47,6 +48,7 @@ class MetacafeIE(InfoExtractor):                  'uploader': 'ign',                  'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',              }, +            'skip': 'Page is temporarily unavailable.',          },          # AnyClip video          { @@ -55,8 +57,8 @@ class MetacafeIE(InfoExtractor):                  'id': 'an-dVVXnuY7Jh77J',                  'ext': 'mp4',                  'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3', -               
 'uploader': 'anyclip', -                'description': 'md5:38c711dd98f5bb87acf973d573442e67', +                'uploader': 'AnyClip', +                'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',              },          },          # age-restricted video @@ -110,28 +112,25 @@ class MetacafeIE(InfoExtractor):      def report_disclaimer(self):          self.to_screen('Retrieving disclaimer') -    def _real_initialize(self): +    def _confirm_age(self):          # Retrieve disclaimer          self.report_disclaimer()          self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')          # Confirm age -        disclaimer_form = { -            'filters': '0', -            'submit': "Continue - I'm over 18", -        } -        request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form)) -        request.add_header('Content-Type', 'application/x-www-form-urlencoded')          self.report_age_confirmation() -        self._download_webpage(request, None, False, 'Unable to confirm age') +        self._download_webpage( +            self._FILTER_POST, None, False, 'Unable to confirm age', +            data=urlencode_postdata({ +                'filters': '0', +                'submit': "Continue - I'm over 18", +            }), headers={ +                'Content-Type': 'application/x-www-form-urlencoded', +            })      def _real_extract(self, url):          # Extract id and simplified title from URL -        mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError('Invalid URL: %s' % url) - -        video_id = mobj.group(1) +        video_id, display_id = re.match(self._VALID_URL, url).groups()          # the video may come from an external site          m_external = re.match('^(\w{2})-(.*)$', video_id) @@ -144,15 +143,24 @@ class MetacafeIE(InfoExtractor):              if prefix == 'cb':                  return self.url_result('theplatform:%s' % ext_id, 
'ThePlatform') -        # Retrieve video webpage to extract further information -        req = sanitized_Request('http://www.metacafe.com/watch/%s/' % video_id) +        # self._confirm_age()          # AnyClip videos require the flashversion cookie so that we get the link          # to the mp4 file -        mobj_an = re.match(r'^an-(.*?)$', video_id) -        if mobj_an: -            req.headers['Cookie'] = 'flashVersion=0;' -        webpage = self._download_webpage(req, video_id) +        headers = {} +        if video_id.startswith('an-'): +            headers['Cookie'] = 'flashVersion=0;' + +        # Retrieve video webpage to extract further information +        webpage = self._download_webpage(url, video_id, headers=headers) + +        error = get_element_by_attribute( +            'class', 'notfound-page-title', webpage) +        if error: +            raise ExtractorError(error, expected=True) + +        video_title = self._html_search_meta( +            ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title')          # Extract URL, uploader and title from webpage          self.report_extraction(video_id) @@ -216,20 +224,40 @@ class MetacafeIE(InfoExtractor):                          'player_url': player_url,                          'ext': play_path.partition(':')[0],                      }) +        if video_url is None: +            flashvars = self._parse_json(self._search_regex( +                r'flashvars\s*=\s*({.*});', webpage, 'flashvars', +                default=None), video_id, fatal=False) +            if flashvars: +                video_url = [] +                for source in flashvars.get('sources'): +                    source_url = source.get('src') +                    if not source_url: +                        continue +                    ext = mimetype2ext(source.get('type')) or determine_ext(source_url) +                    if ext == 'm3u8': +                        
video_url.extend(self._extract_m3u8_formats( +                            source_url, video_id, 'mp4', +                            'm3u8_native', m3u8_id='hls', fatal=False)) +                    else: +                        video_url.append({ +                            'url': source_url, +                            'ext': ext, +                        })          if video_url is None:              raise ExtractorError('Unsupported video type') -        video_title = self._html_search_regex( -            r'(?im)<title>(.*) - Video</title>', webpage, 'title') -        description = self._og_search_description(webpage) -        thumbnail = self._og_search_thumbnail(webpage) +        description = self._html_search_meta( +            ['og:description', 'twitter:description', 'description'], +            webpage, 'title', fatal=False) +        thumbnail = self._html_search_meta( +            ['og:image', 'twitter:image'], webpage, 'title', fatal=False)          video_uploader = self._html_search_regex(              r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',              webpage, 'uploader nickname', fatal=False)          duration = int_or_none( -            self._html_search_meta('video:duration', webpage)) - +            self._html_search_meta('video:duration', webpage, default=None))          age_limit = (              18              if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage) @@ -242,10 +270,11 @@ class MetacafeIE(InfoExtractor):                  'url': video_url,                  'ext': video_ext,              }] -          self._sort_formats(formats) +          return {              'id': video_id, +            'display_id': display_id,              'description': description,              'uploader': video_uploader,              'title': video_title, diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index 9fbc74f5d..d970e94ec 100644 --- 
a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -26,7 +26,8 @@ class MGTVIE(InfoExtractor):          video_id = self._match_id(url)          api_data = self._download_json(              'http://v.api.mgtv.com/player/video', video_id, -            query={'video_id': video_id})['data'] +            query={'video_id': video_id}, +            headers=self.geo_verification_headers())['data']          info = api_data['info']          formats = [] diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 5a00cd397..cd169f361 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -12,12 +12,69 @@ from ..utils import (      get_element_by_attribute,      int_or_none,      remove_start, +    extract_attributes, +    determine_ext,  ) -class MiTeleIE(InfoExtractor): +class MiTeleBaseIE(InfoExtractor): +    def _get_player_info(self, url, webpage): +        player_data = extract_attributes(self._search_regex( +            r'(?s)(<ms-video-player.+?</ms-video-player>)', +            webpage, 'ms video player')) +        video_id = player_data['data-media-id'] +        config_url = compat_urlparse.urljoin(url, player_data['data-config']) +        config = self._download_json( +            config_url, video_id, 'Downloading config JSON') +        mmc_url = config['services']['mmc'] + +        duration = None +        formats = [] +        for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')): +            mmc = self._download_json( +                m_url, video_id, 'Downloading mmc JSON') +            if not duration: +                duration = int_or_none(mmc.get('duration')) +            for location in mmc['locations']: +                gat = self._proto_relative_url(location.get('gat'), 'http:') +                bas = location.get('bas') +                loc = location.get('loc') +                ogn = location.get('ogn') +                if None in (gat, bas, loc, ogn): +              
      continue +                token_data = { +                    'bas': bas, +                    'icd': loc, +                    'ogn': ogn, +                    'sta': '0', +                } +                media = self._download_json( +                    '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), +                    video_id, 'Downloading %s JSON' % location['loc']) +                file_ = media.get('file') +                if not file_: +                    continue +                ext = determine_ext(file_) +                if ext == 'f4m': +                    formats.extend(self._extract_f4m_formats( +                        file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', +                        video_id, f4m_id='hds', fatal=False)) +                elif ext == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'formats': formats, +            'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'), +            'duration': duration, +        } + + +class MiTeleIE(MiTeleBaseIE):      IE_DESC = 'mitele.es' -    _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' +    _VALID_URL = r'https?://www\.mitele\.es/(?:[^/]+/){3}(?P<id>[^/]+)/'      _TESTS = [{          'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', @@ -25,7 +82,7 @@ class MiTeleIE(InfoExtractor):          'info_dict': {              'id': '0NF1jJnxS1Wu3pHrmvFyw2',              'display_id': 'programa-144', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Tor, la web invisible',              'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',              'series': 'Diario de', @@ -40,7 +97,7 @@ class MiTeleIE(InfoExtractor):          'info_dict': 
{              'id': 'eLZSwoEd1S3pVyUm8lc6F',              'display_id': 'programa-226', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Cuarto Milenio - Temporada 6 - Programa 226',              'description': 'md5:50daf9fadefa4e62d9fc866d0c015701',              'series': 'Cuarto Milenio', @@ -59,40 +116,7 @@ class MiTeleIE(InfoExtractor):          webpage = self._download_webpage(url, display_id) -        config_url = self._search_regex( -            r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') -        config_url = compat_urlparse.urljoin(url, config_url) - -        config = self._download_json( -            config_url, display_id, 'Downloading config JSON') - -        mmc = self._download_json( -            config['services']['mmc'], display_id, 'Downloading mmc JSON') - -        formats = [] -        for location in mmc['locations']: -            gat = self._proto_relative_url(location.get('gat'), 'http:') -            bas = location.get('bas') -            loc = location.get('loc') -            ogn = location.get('ogn') -            if None in (gat, bas, loc, ogn): -                continue -            token_data = { -                'bas': bas, -                'icd': loc, -                'ogn': ogn, -                'sta': '0', -            } -            media = self._download_json( -                '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), -                display_id, 'Downloading %s JSON' % location['loc']) -            file_ = media.get('file') -            if not file_: -                continue -            formats.extend(self._extract_f4m_formats( -                file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', -                display_id, f4m_id=loc)) -        self._sort_formats(formats) +        info = self._get_player_info(url, webpage)          title = self._search_regex(              r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', @@ -112,21 +136,12 @@ class 
MiTeleIE(InfoExtractor):                  title = remove_start(self._search_regex(                      r'<title>([^<]+)</title>', webpage, 'title'), 'Ver online ') -        video_id = self._search_regex( -            r'data-media-id\s*=\s*"([^"]+)"', webpage, -            'data media id', default=None) or display_id -        thumbnail = config.get('poster', {}).get('imageUrl') -        duration = int_or_none(mmc.get('duration')) - -        return { -            'id': video_id, +        info.update({              'display_id': display_id,              'title': title,              'description': get_element_by_attribute('class', 'text', webpage),              'series': series,              'season': season,              'episode': episode, -            'thumbnail': thumbnail, -            'duration': duration, -            'formats': formats, -        } +        }) +        return info diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 483f6925f..560fe188b 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -102,11 +102,11 @@ class MixcloudIE(InfoExtractor):          description = self._og_search_description(webpage)          like_count = parse_count(self._search_regex(              r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)', -            webpage, 'like count', fatal=False)) +            webpage, 'like count', default=None))          view_count = str_to_int(self._search_regex(              [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',               r'/listeners/?">([0-9,.]+)</a>'], -            webpage, 'play count', fatal=False)) +            webpage, 'play count', default=None))          return {              'id': track_id, diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py new file mode 100644 index 000000000..1ec8e0f50 --- /dev/null +++ b/youtube_dl/extractor/msn.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from 
__future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    determine_ext, +    ExtractorError, +    int_or_none, +    unescapeHTML, +) + + +class MSNIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)' +    _TESTS = [{ +        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE', +        'md5': '8442f66c116cbab1ff7098f986983458', +        'info_dict': { +            'id': 'BBqQYNE', +            'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message', +            'ext': 'mp4', +            'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message', +            'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25', +            'duration': 104, +            'uploader': 'CBS Entertainment', +            'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v', +        }, +    }, { +        'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf', +        'only_matching': True, +    }, { +        'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH', +        'only_matching': True, +    }, { +        # geo restricted +        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU', +        'only_matching': True, +    }, { +        'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id, display_id = mobj.group('id', 'display_id') + +        webpage = self._download_webpage(url, 
display_id) + +        video = self._parse_json( +            self._search_regex( +                r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1', +                webpage, 'video data', default='{}', group='data'), +            display_id, transform_source=unescapeHTML) + +        if not video: +            error = unescapeHTML(self._search_regex( +                r'data-error=(["\'])(?P<error>.+?)\1', +                webpage, 'error', group='error')) +            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + +        title = video['title'] + +        formats = [] +        for file_ in video.get('videoFiles', []): +            format_url = file_.get('url') +            if not format_url: +                continue +            ext = determine_ext(format_url) +            # .ism is not yet supported (see +            # https://github.com/rg3/youtube-dl/issues/8118) +            if ext == 'ism': +                continue +            if 'm3u8' in format_url: +                # m3u8_native should not be used here until +                # https://github.com/rg3/youtube-dl/issues/9913 is fixed +                m3u8_formats = self._extract_m3u8_formats( +                    format_url, display_id, 'mp4', +                    m3u8_id='hls', fatal=False) +                # Despite metadata in m3u8 all video+audio formats are +                # actually video-only (no audio) +                for f in m3u8_formats: +                    if f.get('acodec') != 'none' and f.get('vcodec') != 'none': +                        f['acodec'] = 'none' +                formats.extend(m3u8_formats) +            else: +                formats.append({ +                    'url': format_url, +                    'ext': 'mp4', +                    'format_id': 'http', +                    'width': int_or_none(file_.get('width')), +                    'height': int_or_none(file_.get('height')), +                }) +        self._sort_formats(formats) + +        
subtitles = {} +        for file_ in video.get('files', []): +            format_url = file_.get('url') +            format_code = file_.get('formatCode') +            if not format_url or not format_code: +                continue +            if compat_str(format_code) == '3100': +                subtitles.setdefault(file_.get('culture', 'en'), []).append({ +                    'ext': determine_ext(format_url, 'ttml'), +                    'url': format_url, +                }) + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': video.get('description'), +            'thumbnail': video.get('headlineImage', {}).get('url'), +            'duration': int_or_none(video.get('durationSecs')), +            'uploader': video.get('sourceFriendly'), +            'uploader_id': video.get('providerId'), +            'creator': video.get('creator'), +            'subtitles': subtitles, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 722518663..e717abb9f 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -1,6 +1,7 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from .theplatform import ThePlatformIE  from ..utils import (      smuggle_url,      url_basename, @@ -61,7 +62,7 @@ class NationalGeographicIE(InfoExtractor):          } -class NationalGeographicChannelIE(InfoExtractor): +class NationalGeographicChannelIE(ThePlatformIE):      IE_NAME = 'natgeo:channel'      _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P<id>[^/?]+)' @@ -102,12 +103,22 @@ class NationalGeographicChannelIE(InfoExtractor):          release_url = self._search_regex(              r'video_auth_playlist_url\s*=\s*"([^"]+)"',              webpage, 'release url') +        query = { +            
'mbr': 'true', +            'switch': 'http', +        } +        is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False) +        if is_auth == 'auth': +            auth_resource_id = self._search_regex( +                r"video_auth_resourceId\s*=\s*'([^']+)'", +                webpage, 'auth resource id') +            query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) or ''          return {              '_type': 'url_transparent',              'ie_key': 'ThePlatform',              'url': smuggle_url( -                update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}), +                update_url_query(release_url, query),                  {'force_smil_url': True}),              'display_id': display_id,          } diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 6b7da1149..f694e210b 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,10 +9,6 @@ from ..utils import (      lowercase_escape,      smuggle_url,      unescapeHTML, -    update_url_query, -    int_or_none, -    HEADRequest, -    parse_iso8601,  ) @@ -192,9 +188,9 @@ class CSNNEIE(InfoExtractor):  class NBCNewsIE(ThePlatformIE): -    _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/ +    _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/          (?:video/.+?/(?P<id>\d+)| -        ([^/]+/)*(?P<display_id>[^/?]+)) +        ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+))          '''      _TESTS = [ @@ -216,13 +212,16 @@ class NBCNewsIE(ThePlatformIE):                  'ext': 'mp4',                  'title': 'How Twitter Reacted To The Snowden Interview',                  'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', +                'uploader': 'NBCU-NEWS', +                'timestamp': 1401363060, +                'upload_date': '20140529',              },          },          {              'url': 
'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',              'md5': 'fdbf39ab73a72df5896b6234ff98518a',              'info_dict': { -                'id': 'Wjf9EDR3A_60', +                'id': '529953347624',                  'ext': 'mp4',                  'title': 'FULL EPISODE: Family Business',                  'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', @@ -237,6 +236,9 @@ class NBCNewsIE(ThePlatformIE):                  'ext': 'mp4',                  'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',                  'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', +                'timestamp': 1423104900, +                'uploader': 'NBCU-NEWS', +                'upload_date': '20150205',              },          },          { @@ -245,10 +247,12 @@ class NBCNewsIE(ThePlatformIE):              'info_dict': {                  'id': '529953347624',                  'ext': 'mp4', -                'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'', -                'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', +                'title': 'Volkswagen U.S. 
Chief:\xa0 We Have Totally Screwed Up', +                'description': 'md5:c8be487b2d80ff0594c005add88d8351', +                'upload_date': '20150922', +                'timestamp': 1442917800, +                'uploader': 'NBCU-NEWS',              }, -            'expected_warnings': ['http-6000 is not available']          },          {              'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', @@ -260,6 +264,22 @@ class NBCNewsIE(ThePlatformIE):                  'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',                  'upload_date': '20160420',                  'timestamp': 1461152093, +                'uploader': 'NBCU-NEWS', +            }, +        }, +        { +            'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', +            'md5': '6d236bf4f3dddc226633ce6e2c3f814d', +            'info_dict': { +                'id': '314487875924', +                'ext': 'mp4', +                'title': 'The chaotic GOP immigration vote', +                'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', +                'thumbnail': 're:^https?://.*\.jpg$', +                'timestamp': 1406937606, +                'upload_date': '20140802', +                'uploader': 'NBCU-NEWS', +                'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],              },          },          { @@ -290,105 +310,28 @@ class NBCNewsIE(ThePlatformIE):              }          else:              # "feature" and "nightly-news" pages use theplatform.com -            display_id = mobj.group('display_id') -            webpage = self._download_webpage(url, display_id) -            info = None -            bootstrap_json = self._search_regex( -                
[r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', -                 r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], -                webpage, 'bootstrap json', default=None) -            bootstrap = self._parse_json( -                bootstrap_json, display_id, transform_source=unescapeHTML) -            if 'results' in bootstrap: -                info = bootstrap['results'][0]['video'] -            elif 'video' in bootstrap: -                info = bootstrap['video'] -            else: -                info = bootstrap -            video_id = info['mpxId'] -            title = info['title'] - -            subtitles = {} -            caption_links = info.get('captionLinks') -            if caption_links: -                for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')): -                    sub_url = caption_links.get(sub_key) -                    if sub_url: -                        subtitles.setdefault('en', []).append({ -                            'url': sub_url, -                            'ext': sub_ext, -                        }) - -            formats = [] -            for video_asset in info['videoAssets']: -                video_url = video_asset.get('publicUrl') -                if not video_url: -                    continue -                container = video_asset.get('format') -                asset_type = video_asset.get('assetType') or '' -                if container == 'ISM' or asset_type == 'FireTV-Once': -                    continue -                elif asset_type == 'OnceURL': -                    tp_formats, tp_subtitles = self._extract_theplatform_smil( -                        video_url, video_id) -                    formats.extend(tp_formats) -                    subtitles = self._merge_subtitles(subtitles, tp_subtitles) +            video_id = mobj.group('mpx_id') +            if not video_id.isdigit(): +                webpage = self._download_webpage(url, 
video_id) +                info = None +                bootstrap_json = self._search_regex( +                    [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', +                     r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], +                    webpage, 'bootstrap json', default=None) +                bootstrap = self._parse_json( +                    bootstrap_json, video_id, transform_source=unescapeHTML) +                if 'results' in bootstrap: +                    info = bootstrap['results'][0]['video'] +                elif 'video' in bootstrap: +                    info = bootstrap['video']                  else: -                    tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000) -                    format_id = 'http%s' % ('-%d' % tbr if tbr else '') -                    video_url = update_url_query( -                        video_url, {'format': 'redirect'}) -                    # resolve the url so that we can check availability and detect the correct extension -                    head = self._request_webpage( -                        HEADRequest(video_url), video_id, -                        'Checking %s url' % format_id, -                        '%s is not available' % format_id, -                        fatal=False) -                    if head: -                        video_url = head.geturl() -                        formats.append({ -                            'format_id': format_id, -                            'url': video_url, -                            'width': int_or_none(video_asset.get('width')), -                            'height': int_or_none(video_asset.get('height')), -                            'tbr': tbr, -                            'container': video_asset.get('format'), -                        }) -            self._sort_formats(formats) +                    info = bootstrap +                video_id = info['mpxId']              return { +    
            '_type': 'url_transparent',                  'id': video_id, -                'title': title, -                'description': info.get('description'), -                'thumbnail': info.get('thumbnail'), -                'duration': int_or_none(info.get('duration')), -                'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')), -                'formats': formats, -                'subtitles': subtitles, +                # http://feed.theplatform.com/f/2E2eJC/nbcnews also works +                'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id, +                'ie_key': 'ThePlatformFeed',              } - - -class MSNBCIE(InfoExtractor): -    # https URLs redirect to corresponding http ones -    _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' -    _TEST = { -        'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', -        'md5': '6d236bf4f3dddc226633ce6e2c3f814d', -        'info_dict': { -            'id': 'n_hayes_Aimm_140801_272214', -            'ext': 'mp4', -            'title': 'The chaotic GOP immigration vote', -            'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', -            'thumbnail': 're:^https?://.*\.jpg$', -            'timestamp': 1406937606, -            'upload_date': '20140802', -            'uploader': 'NBCU-NEWS', -            'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], -        }, -    } - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) -        embed_url = self._html_search_meta('embedURL', webpage) -        return self.url_result(embed_url) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 
e96013791..4935002d0 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -8,7 +8,7 @@ from ..utils import update_url_query  class NickIE(MTVServicesInfoExtractor):      IE_NAME = 'nick.com' -    _VALID_URL = r'https?://(?:www\.)?nick\.com/videos/clip/(?P<id>[^/?#.]+)' +    _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'      _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'      _TESTS = [{          'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html', @@ -52,6 +52,9 @@ class NickIE(MTVServicesInfoExtractor):                  }              },          ], +    }, { +        'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/', +        'only_matching': True,      }]      def _get_feed_query(self, uri): diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py new file mode 100644 index 000000000..d889245ad --- /dev/null +++ b/youtube_dl/extractor/ninecninemedia.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    parse_iso8601, +    parse_duration, +    ExtractorError +) + + +class NineCNineMediaIE(InfoExtractor): +    _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)' + +    def _real_extract(self, url): +        destination_code, video_id = re.match(self._VALID_URL, url).groups() +        api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id) +        content = self._download_json(api_base_url, video_id, query={ +            '$include': '[contentpackages]', +        }) +        title = content['Name'] +        if len(content['ContentPackages']) > 1: +            raise ExtractorError('multiple content packages') +        content_package = content['ContentPackages'][0] +        
stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id'] +        stacks = self._download_json(stacks_base_url, video_id)['Items'] +        if len(stacks) > 1: +            raise ExtractorError('multiple stacks') +        stack = stacks[0] +        stack_base_url = '%s%s/manifest.' % (stacks_base_url, stack['Id']) +        formats = [] +        formats.extend(self._extract_m3u8_formats( +            stack_base_url + 'm3u8', video_id, 'mp4', +            'm3u8_native', m3u8_id='hls', fatal=False)) +        formats.extend(self._extract_f4m_formats( +            stack_base_url + 'f4m', video_id, +            f4m_id='hds', fatal=False)) +        mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False) +        if mp4_url: +            formats.append({ +                'url': mp4_url, +                'format_id': 'mp4', +            }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': content.get('Desc') or content.get('ShortDesc'), +            'timestamp': parse_iso8601(content.get('BroadcastDateTime')), +            'duration': parse_duration(content.get('BroadcastTime')), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py new file mode 100644 index 000000000..402d3a9f7 --- /dev/null +++ b/youtube_dl/extractor/onet.py @@ -0,0 +1,172 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    ExtractorError, +    float_or_none, +    get_element_by_class, +    int_or_none, +    js_to_json, +    parse_iso8601, +    remove_start, +    strip_or_none, +    url_basename, +) + + +class OnetBaseIE(InfoExtractor): +    def _search_mvp_id(self, webpage): +        return self._search_regex( +            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') + +    
def _extract_from_id(self, video_id, webpage): +        response = self._download_json( +            'http://qi.ckm.onetapi.pl/', video_id, +            query={ +                'body[id]': video_id, +                'body[jsonrpc]': '2.0', +                'body[method]': 'get_asset_detail', +                'body[params][ID_Publikacji]': video_id, +                'body[params][Service]': 'www.onet.pl', +                'content-type': 'application/jsonp', +                'x-onet-app': 'player.front.onetapi.pl', +            }) + +        error = response.get('error') +        if error: +            raise ExtractorError( +                '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + +        video = response['result'].get('0') + +        formats = [] +        for _, formats_dict in video['formats'].items(): +            if not isinstance(formats_dict, dict): +                continue +            for format_id, format_list in formats_dict.items(): +                if not isinstance(format_list, list): +                    continue +                for f in format_list: +                    video_url = f.get('url') +                    if not video_url: +                        continue +                    ext = determine_ext(video_url) +                    if format_id == 'ism': +                        # TODO: Support Microsoft Smooth Streaming +                        continue +                    elif ext == 'mpd': +                        # TODO: Current DASH formats are broken - $Time$ pattern in +                        # <SegmentTemplate> not implemented yet +                        # formats.extend(self._extract_mpd_formats( +                        #    video_url, video_id, mpd_id='dash', fatal=False)) +                        continue +                    else: +                        formats.append({ +                            'url': video_url, +                            'format_id': format_id, +                            
'height': int_or_none(f.get('vertical_resolution')), +                            'width': int_or_none(f.get('horizontal_resolution')), +                            'abr': float_or_none(f.get('audio_bitrate')), +                            'vbr': float_or_none(f.get('video_bitrate')), +                        }) +        self._sort_formats(formats) + +        meta = video.get('meta', {}) + +        title = self._og_search_title(webpage, default=None) or meta['title'] +        description = self._og_search_description(webpage, default=None) or meta.get('description') +        duration = meta.get('length') or meta.get('lenght') +        timestamp = parse_iso8601(meta.get('addDate'), ' ') + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'duration': duration, +            'timestamp': timestamp, +            'formats': formats, +        } + + +class OnetIE(OnetBaseIE): +    _VALID_URL = 'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)' +    IE_NAME = 'onet.tv' + +    _TEST = { +        'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', +        'md5': 'e3ffbf47590032ac3f27249204173d50', +        'info_dict': { +            'id': 'qbpyqc', +            'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd', +            'ext': 'mp4', +            'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd', +            'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. 
Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...', +            'upload_date': '20160705', +            'timestamp': 1467721580, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        display_id, video_id = mobj.group('display_id', 'id') + +        webpage = self._download_webpage(url, display_id) + +        mvp_id = self._search_mvp_id(webpage) + +        info_dict = self._extract_from_id(mvp_id, webpage) +        info_dict.update({ +            'id': video_id, +            'display_id': display_id, +        }) + +        return info_dict + + +class OnetChannelIE(OnetBaseIE): +    _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]|$)' +    IE_NAME = 'onet.tv:channel' + +    _TEST = { +        'url': 'http://onet.tv/k/openerfestival', +        'info_dict': { +            'id': 'openerfestival', +            'title': 'Open\'er Festival Live', +            'description': 'Dziękujemy, że oglądaliście transmisje. 
Zobaczcie nasze relacje i wywiady z artystami.', +        }, +        'playlist_mincount': 46, +    } + +    def _real_extract(self, url): +        channel_id = self._match_id(url) + +        webpage = self._download_webpage(url, channel_id) + +        current_clip_info = self._parse_json(self._search_regex( +            r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id, +            transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s))) +        video_id = remove_start(current_clip_info['ckmId'], 'mvp:') +        video_name = url_basename(current_clip_info['url']) + +        if self._downloader.params.get('noplaylist'): +            self.to_screen( +                'Downloading just video %s because of --no-playlist' % video_name) +            return self._extract_from_id(video_id, webpage) + +        self.to_screen( +            'Downloading channel %s - add --no-playlist to just download video %s' % ( +                channel_id, video_name)) +        matches = re.findall( +            r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)', +            webpage) +        entries = [ +            self.url_result(video_link, OnetIE.ie_key()) +            for video_link in matches] + +        channel_title = strip_or_none(get_element_by_class('o_channelName', webpage)) +        channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage)) +        return self.playlist_result(entries, channel_id, channel_title, channel_description) diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index d7b13a0f1..6fb1a3fcc 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -7,6 +7,8 @@ from .common import InfoExtractor  from ..utils import (      determine_ext,      int_or_none, +    float_or_none, +    mimetype2ext,  ) @@ -15,15 +17,14 @@ class OnionStudiosIE(InfoExtractor):      _TESTS = [{          'url': 
'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', -        'md5': 'd4851405d31adfadf71cd7a487b765bb', +        'md5': 'e49f947c105b8a78a675a0ee1bddedfe',          'info_dict': {              'id': '2937',              'ext': 'mp4',              'title': 'Hannibal charges forward, stops for a cocktail', -            'description': 'md5:e786add7f280b7f0fe237b64cc73df76',              'thumbnail': 're:^https?://.*\.jpg$',              'uploader': 'The A.V. Club', -            'uploader_id': 'TheAVClub', +            'uploader_id': 'the-av-club',          },      }, {          'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true', @@ -40,50 +41,38 @@ class OnionStudiosIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        webpage = self._download_webpage( -            'http://www.onionstudios.com/embed?id=%s' % video_id, video_id) +        video_data = self._download_json( +            'http://www.onionstudios.com/video/%s.json' % video_id, video_id) + +        title = video_data['title']          formats = [] -        for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage): -            ext = determine_ext(src) +        for source in video_data.get('sources', []): +            source_url = source.get('url') +            if not source_url: +                continue +            ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url)              if ext == 'm3u8':                  formats.extend(self._extract_m3u8_formats( -                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) +                    source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))              else: -                height = int_or_none(self._search_regex( -                    r'/(\d+)\.%s' % ext, src, 'height', default=None)) +                tbr = int_or_none(source.get('bitrate'))                  formats.append({ -            
        'format_id': ext + ('-%sp' % height if height else ''), -                    'url': src, -                    'height': height, +                    'format_id': ext + ('-%d' % tbr if tbr else ''), +                    'url': source_url, +                    'width': int_or_none(source.get('width')), +                    'tbr': tbr,                      'ext': ext, -                    'preference': 1,                  })          self._sort_formats(formats) -        title = self._search_regex( -            r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1', -            webpage, 'title', group='title') -        description = self._search_regex( -            r'share_description\s*=\s*(["\'])(?P<description>[^\'"]+?)\1', -            webpage, 'description', default=None, group='description') -        thumbnail = self._search_regex( -            r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1', -            webpage, 'thumbnail', default=False, group='thumbnail') - -        uploader_id = self._search_regex( -            r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1', -            webpage, 'uploader id', fatal=False, group='uploader_id') -        uploader = self._search_regex( -            r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1', -            webpage, 'uploader', default=False, group='uploader') -          return {              'id': video_id,              'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'uploader': uploader, -            'uploader_id': uploader_id, +            'thumbnail': video_data.get('poster_url'), +            'uploader': video_data.get('channel_name'), +            'uploader_id': video_data.get('channel_slug'), +            'duration': float_or_none(video_data.get('duration', 1000)), +            'tags': video_data.get('tags'),              'formats': formats,          } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py 
index 81918ac6e..f6f423597 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -516,9 +516,14 @@ class PBSIE(InfoExtractor):                  # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications                  if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'):                      continue +                f_url = re.sub(r'\d+k|baseline', bitrate, http_url) +                # This may produce invalid links sometimes (e.g. +                # http://www.pbs.org/wgbh/frontline/film/suicide-plan) +                if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate): +                    continue                  f = m3u8_format.copy()                  f.update({ -                    'url': re.sub(r'\d+k|baseline', bitrate, http_url), +                    'url': f_url,                      'format_id': m3u8_format['format_id'].replace('hls', 'http'),                      'protocol': 'http',                  }) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index c23b314e7..75f5884a9 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -120,9 +120,12 @@ class PeriscopeUserIE(InfoExtractor):          title = user.get('display_name') or user.get('username')          description = user.get('description') +        broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or +                         data_store.get('BroadcastCache', {}).get('broadcastIds', [])) +          entries = [              self.url_result( -                'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id'])) -            for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])] +                'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id)) +            for broadcast_id in broadcast_ids]          return self.playlist_result(entries, user_id, title, description) diff --git 
a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index bc559d1df..77e1211d6 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -49,7 +49,7 @@ class PladformIE(InfoExtractor):      @staticmethod      def _extract_url(webpage):          mobj = re.search( -            r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage) +            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage)          if mobj:              return mobj.group('url') diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py new file mode 100644 index 000000000..f559b899f --- /dev/null +++ b/youtube_dl/extractor/polskieradio.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( +    compat_str, +    compat_urllib_parse_unquote, +) +from ..utils import ( +    int_or_none, +    strip_or_none, +    unified_timestamp, +) + + +class PolskieRadioIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' +    _TESTS = [{ +        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', +        'info_dict': { +            'id': '1587943', +            'title': 'Prof. 
Andrzej Nowak: o historii nie da się myśleć beznamiętnie', +            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', +        }, +        'playlist': [{ +            'md5': '2984ee6ce9046d91fc233bc1a864a09a', +            'info_dict': { +                'id': '1540576', +                'ext': 'mp3', +                'title': 'md5:d4623290d4ac983bf924061c75c23a0d', +                'timestamp': 1456594200, +                'upload_date': '20160227', +                'duration': 2364, +                'thumbnail': 're:^https?://static\.prsa\.pl/images/.*\.jpg$' +            }, +        }], +    }, { +        'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', +        'info_dict': { +            'id': '1635803', +            'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał', +            'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', +        }, +        'playlist_mincount': 12, +    }, { +        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', +        'only_matching': True, +    }, { +        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', +        'only_matching': True, +    }, { +        # with mp4 video +        'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        content = self._search_regex( +            r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>', +            webpage, 'content') + +        timestamp = unified_timestamp(self._html_search_regex( +            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', +            webpage, 'timestamp', fatal=False)) + +        thumbnail_url = 
self._og_search_thumbnail(webpage) + +        entries = [] + +        media_urls = set() + +        for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content): +            media = self._parse_json(data_media, playlist_id, fatal=False) +            if not media.get('file') or not media.get('desc'): +                continue +            media_url = self._proto_relative_url(media['file'], 'http:') +            if media_url in media_urls: +                continue +            media_urls.add(media_url) +            entries.append({ +                'id': compat_str(media['id']), +                'url': media_url, +                'title': compat_urllib_parse_unquote(media['desc']), +                'duration': int_or_none(media.get('length')), +                'vcodec': 'none' if media.get('provider') == 'audio' else None, +                'timestamp': timestamp, +                'thumbnail': thumbnail_url +            }) + +        title = self._og_search_title(webpage).strip() +        description = strip_or_none(self._og_search_description(webpage)) + +        return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 6d57e1d35..d2c92531b 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -25,7 +25,15 @@ from ..aes import (  class PornHubIE(InfoExtractor): -    _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)' +    IE_DESC = 'PornHub and Thumbzilla' +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)| +                            (?:www\.)?thumbzilla\.com/video/ +                        ) +                        (?P<id>[0-9a-z]+) +                    '''      _TESTS = [{          'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',   
       'md5': '1e19b41231a02eba417839222ac9d58e', @@ -63,8 +71,24 @@ class PornHubIE(InfoExtractor):          'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',          'only_matching': True,      }, { +        # removed at the request of cam4.com          'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',          'only_matching': True, +    }, { +        # removed at the request of the copyright owner +        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', +        'only_matching': True, +    }, { +        # removed by uploader +        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', +        'only_matching': True, +    }, { +        # private video +        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', +        'only_matching': True, +    }, { +        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', +        'only_matching': True,      }]      @classmethod @@ -87,8 +111,8 @@ class PornHubIE(InfoExtractor):          webpage = self._download_webpage(req, video_id)          error_msg = self._html_search_regex( -            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>', -            webpage, 'error message', default=None) +            r'(?s)<div[^>]+class=(["\']).*?\b(?:removed|userMessageSection)\b.*?\1[^>]*>(?P<error>.+?)</div>', +            webpage, 'error message', default=None, group='error')          if error_msg:              error_msg = re.sub(r'\s+', ' ', error_msg)              raise ExtractorError( diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 07d49d489..c6eee3b72 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -5,7 +5,7 @@ import re  from hashlib import sha1  from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode +from ..compat import compat_str  from ..utils import (    
  ExtractorError,      determine_ext, @@ -71,6 +71,7 @@ class ProSiebenSat1IE(InfoExtractor):                  # rtmp download                  'skip_download': True,              }, +            'skip': 'This video is unavailable',          },          {              'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', @@ -86,6 +87,7 @@ class ProSiebenSat1IE(InfoExtractor):                  # rtmp download                  'skip_download': True,              }, +            'skip': 'This video is unavailable',          },          {              'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', @@ -101,6 +103,7 @@ class ProSiebenSat1IE(InfoExtractor):                  # rtmp download                  'skip_download': True,              }, +            'skip': 'This video is unavailable',          },          {              'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', @@ -116,6 +119,7 @@ class ProSiebenSat1IE(InfoExtractor):                  # rtmp download                  'skip_download': True,              }, +            'skip': 'This video is unavailable',          },          {              'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', @@ -131,6 +135,7 @@ class ProSiebenSat1IE(InfoExtractor):                  # rtmp download                  'skip_download': True,              }, +            'skip': 'This video is unavailable',          },          {              'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', @@ -227,70 +232,42 @@ class ProSiebenSat1IE(InfoExtractor):      ]      def _extract_clip(self, url, webpage): -        clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') +        clip_id = self._html_search_regex( +            self._CLIPID_REGEXES, webpage, 'clip id')          access_token = 'prosieben'   
       client_name = 'kolibri-2.0.19-splec4'          client_location = url -        videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse_urlencode({ -            'access_token': access_token, -            'client_location': client_location, -            'client_name': client_name, -            'ids': clip_id, -        }) - -        video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0] +        video = self._download_json( +            'http://vas.sim-technik.de/vas/live/v2/videos', +            clip_id, 'Downloading videos JSON', query={ +                'access_token': access_token, +                'client_location': client_location, +                'client_name': client_name, +                'ids': clip_id, +            })[0]          if video.get('is_protected') is True:              raise ExtractorError('This video is DRM protected.', expected=True)          duration = float_or_none(video.get('duration')) -        source_ids = [source['id'] for source in video['sources']] -        source_ids_str = ','.join(map(str, source_ids)) +        source_ids = [compat_str(source['id']) for source in video['sources']]          g = '01!8d8F_)r9]4s[qeuXfP%' +        client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]).encode('utf-8')).hexdigest() -        client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]) -                                 .encode('utf-8')).hexdigest() - -        sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse_urlencode({ -            'access_token': access_token, -            'client_id': client_id, -            'client_location': client_location, -            'client_name': client_name, -        })) - -        sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON') +        sources = self._download_json( +            
'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, +            clip_id, 'Downloading sources JSON', query={ +                'access_token': access_token, +                'client_id': client_id, +                'client_location': client_location, +                'client_name': client_name, +            })          server_id = sources['server_id'] -        client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, -                                          client_location, source_ids_str, g, client_name]) -                                 .encode('utf-8')).hexdigest() - -        url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({ -            'access_token': access_token, -            'client_id': client_id, -            'client_location': client_location, -            'client_name': client_name, -            'server_id': server_id, -            'source_ids': source_ids_str, -        })) - -        urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') -          title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') -        description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) -        thumbnail = self._og_search_thumbnail(webpage) - -        upload_date = unified_strdate(self._html_search_regex( -            self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) - -        formats = [] - -        urls_sources = urls['sources'] -        if isinstance(urls_sources, dict): -            urls_sources = urls_sources.values()          def fix_bitrate(bitrate):              bitrate = int_or_none(bitrate) @@ -298,37 +275,73 @@ class ProSiebenSat1IE(InfoExtractor):                  return None              return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate -        for source in urls_sources: -            protocol = source['protocol'] -            source_url = 
source['url'] -            if protocol == 'rtmp' or protocol == 'rtmpe': -                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) -                if not mobj: -                    continue -                path = mobj.group('path') -                mp4colon_index = path.rfind('mp4:') -                app = path[:mp4colon_index] -                play_path = path[mp4colon_index:] -                formats.append({ -                    'url': '%s/%s' % (mobj.group('url'), app), -                    'app': app, -                    'play_path': play_path, -                    'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', -                    'page_url': 'http://www.prosieben.de', -                    'vbr': fix_bitrate(source['bitrate']), -                    'ext': 'mp4', -                    'format_id': '%s_%s' % (source['cdn'], source['bitrate']), -                }) -            elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': -                formats.extend(self._extract_f4m_formats(source_url, clip_id)) -            else: -                formats.append({ -                    'url': source_url, -                    'vbr': fix_bitrate(source['bitrate']), +        formats = [] +        for source_id in source_ids: +            client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, client_location, source_id, g, client_name]).encode('utf-8')).hexdigest() +            urls = self._download_json( +                'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, +                clip_id, 'Downloading urls JSON', fatal=False, query={ +                    'access_token': access_token, +                    'client_id': client_id, +                    'client_location': client_location, +                    'client_name': client_name, +                    'server_id': server_id, +                    'source_ids': source_id,            
      }) - +            if not urls: +                continue +            if urls.get('status_code') != 0: +                raise ExtractorError('This video is unavailable', expected=True) +            urls_sources = urls['sources'] +            if isinstance(urls_sources, dict): +                urls_sources = urls_sources.values() +            for source in urls_sources: +                source_url = source.get('url') +                if not source_url: +                    continue +                protocol = source.get('protocol') +                mimetype = source.get('mimetype') +                if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': +                    formats.extend(self._extract_f4m_formats( +                        source_url, clip_id, f4m_id='hds', fatal=False)) +                elif mimetype == 'application/x-mpegURL': +                    formats.extend(self._extract_m3u8_formats( +                        source_url, clip_id, 'mp4', 'm3u8_native', +                        m3u8_id='hls', fatal=False)) +                else: +                    tbr = fix_bitrate(source['bitrate']) +                    if protocol in ('rtmp', 'rtmpe'): +                        mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) +                        if not mobj: +                            continue +                        path = mobj.group('path') +                        mp4colon_index = path.rfind('mp4:') +                        app = path[:mp4colon_index] +                        play_path = path[mp4colon_index:] +                        formats.append({ +                            'url': '%s/%s' % (mobj.group('url'), app), +                            'app': app, +                            'play_path': play_path, +                            'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', +                            
'page_url': 'http://www.prosieben.de', +                            'tbr': tbr, +                            'ext': 'flv', +                            'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), +                        }) +                    else: +                        formats.append({ +                            'url': source_url, +                            'tbr': tbr, +                            'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), +                        })          self._sort_formats(formats) +        description = self._html_search_regex( +            self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) +        thumbnail = self._og_search_thumbnail(webpage) +        upload_date = unified_strdate(self._html_search_regex( +            self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) +          return {              'id': clip_id,              'title': title, diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 4f05bbddc..8ec402646 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -12,6 +12,7 @@ from ..utils import (      unified_strdate,      xpath_element,      ExtractorError, +    determine_protocol,  ) @@ -22,13 +23,13 @@ class RadioCanadaIE(InfoExtractor):          'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',          'info_dict': {              'id': '7184272', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Le parcours du tireur capté sur vidéo',              'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',              'upload_date': '20141023',          },          'params': { -            # rtmp download +            # m3u8 download              'skip_download': True,          },      } @@ -36,11 +37,14 @@ class RadioCanadaIE(InfoExtractor):      def _real_extract(self, url):         
 app_code, video_id = re.match(self._VALID_URL, url).groups() +        device_types = ['ipad', 'android'] +        if app_code != 'toutv': +            device_types.append('flash') +          formats = [] -        # TODO: extract m3u8 and f4m formats -        # m3u8 formats can be extracted using ipad device_type return 403 error code when ffmpeg try to download segements +        # TODO: extract f4m formats          # f4m formats can be extracted using flashhd device_type but they produce unplayable file -        for device_type in ('flash',): +        for device_type in device_types:              v_data = self._download_xml(                  'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx',                  video_id, note='Downloading %s XML' % device_type, query={ @@ -52,7 +56,7 @@ class RadioCanadaIE(InfoExtractor):                      # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction                      'paysJ391wsHjbOJwvCs26toz': 'CA',                      'bypasslock': 'NZt5K62gRqfc', -                }) +                }, fatal=False)              v_url = xpath_text(v_data, 'url')              if not v_url:                  continue @@ -64,7 +68,8 @@ class RadioCanadaIE(InfoExtractor):                  formats.extend(self._extract_m3u8_formats(                      v_url, video_id, 'mp4', m3u8_id='hls', fatal=False))              elif ext == 'f4m': -                formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False)) +                formats.extend(self._extract_f4m_formats( +                    v_url, video_id, f4m_id='hds', fatal=False))              else:                  ext = determine_ext(v_url)                  bitrates = xpath_element(v_data, 'bitrates') @@ -72,15 +77,28 @@ class RadioCanadaIE(InfoExtractor):                      tbr = int_or_none(url_e.get('bitrate'))                      if not tbr:                          continue +                    f_url = 
re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url) +                    protocol = determine_protocol({'url': f_url})                      formats.append({ -                        'format_id': 'rtmp-%d' % tbr, -                        'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url), -                        'ext': 'flv', -                        'protocol': 'rtmp', +                        'format_id': '%s-%d' % (protocol, tbr), +                        'url': f_url, +                        'ext': 'flv' if protocol == 'rtmp' else ext, +                        'protocol': protocol,                          'width': int_or_none(url_e.get('width')),                          'height': int_or_none(url_e.get('height')),                          'tbr': tbr,                      }) +                    if protocol == 'rtsp': +                        base_url = self._search_regex( +                            r'rtsp://([^?]+)', f_url, 'base url', default=None) +                        if base_url: +                            base_url = 'http://' + base_url +                            formats.extend(self._extract_m3u8_formats( +                                base_url + '/playlist.m3u8', video_id, 'mp4', +                                'm3u8_native', m3u8_id='hls', fatal=False)) +                            formats.extend(self._extract_f4m_formats( +                                base_url + '/manifest.f4m', video_id, +                                f4m_id='hds', fatal=False))          self._sort_formats(formats)          metadata = self._download_xml( @@ -115,13 +133,13 @@ class RadioCanadaAudioVideoIE(InfoExtractor):          'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',          'info_dict': {              'id': '7527184', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Barack Obama au Vietnam',              'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui 
datait de la guerre du Vietnam',              'upload_date': '20160523',          },          'params': { -            # rtmp download +            # m3u8 download              'skip_download': True,          },      } diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index e36ce1aa1..dc640b1bc 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,47 +1,141 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from ..compat import ( -    compat_urllib_parse, -    compat_urlparse, -) +from ..compat import compat_urlparse  from ..utils import ( -    ExtractorError,      determine_ext, +    ExtractorError, +    find_xpath_attr, +    fix_xml_ampersands, +    int_or_none,      parse_duration,      unified_strdate, -    int_or_none, +    update_url_query,      xpath_text,  ) -class RaiTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' +class RaiBaseIE(InfoExtractor): +    def _extract_relinker_formats(self, relinker_url, video_id): +        formats = [] + +        for platform in ('mon', 'flash', 'native'): +            relinker = self._download_xml( +                relinker_url, video_id, +                note='Downloading XML metadata for platform %s' % platform, +                transform_source=fix_xml_ampersands, +                query={'output': 45, 'pl': platform}, +                headers=self.geo_verification_headers()) + +            media_url = find_xpath_attr(relinker, './url', 'type', 'content').text +            if media_url == 'http://download.rai.it/video_no_available.mp4': +                self.raise_geo_restricted() + +            ext = determine_ext(media_url) +            if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): +                continue + +            if ext == 'm3u8': +          
      formats.extend(self._extract_m3u8_formats( +                    media_url, video_id, 'mp4', 'm3u8_native', +                    m3u8_id='hls', fatal=False)) +            elif ext == 'f4m': +                manifest_url = update_url_query( +                    media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), +                    {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) +                formats.extend(self._extract_f4m_formats( +                    manifest_url, video_id, f4m_id='hds', fatal=False)) +            else: +                bitrate = int_or_none(xpath_text(relinker, 'bitrate')) +                formats.append({ +                    'url': media_url, +                    'tbr': bitrate if bitrate > 0 else None, +                    'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', +                }) + +        return formats + +    def _extract_from_content_id(self, content_id, base_url): +        media = self._download_json( +            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, +            content_id, 'Downloading video JSON') + +        thumbnails = [] +        for image_type in ('image', 'image_medium', 'image_300'): +            thumbnail_url = media.get(image_type) +            if thumbnail_url: +                thumbnails.append({ +                    'url': compat_urlparse.urljoin(base_url, thumbnail_url), +                }) + +        formats = [] +        media_type = media['type'] +        if 'Audio' in media_type: +            formats.append({ +                'format_id': media.get('formatoAudio'), +                'url': media['audioUrl'], +                'ext': media.get('formatoAudio'), +            }) +        elif 'Video' in media_type: +            formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) +            self._sort_formats(formats) +        else: +            raise ExtractorError('not a media file') + +        subtitles = 
{} +        captions = media.get('subtitlesUrl') +        if captions: +            STL_EXT = '.stl' +            SRT_EXT = '.srt' +            if captions.endswith(STL_EXT): +                captions = captions[:-len(STL_EXT)] + SRT_EXT +            subtitles['it'] = [{ +                'ext': 'srt', +                'url': captions, +            }] + +        return { +            'id': content_id, +            'title': media['name'], +            'description': media.get('desc'), +            'thumbnails': thumbnails, +            'uploader': media.get('author'), +            'upload_date': unified_strdate(media.get('date')), +            'duration': parse_duration(media.get('length')), +            'formats': formats, +            'subtitles': subtitles, +        } + + +class RaiTVIE(RaiBaseIE): +    _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'      _TESTS = [          {              'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', -            'md5': '96382709b61dd64a6b88e0f791e6df4c', +            'md5': '8970abf8caf8aef4696e7b1f2adfc696',              'info_dict': {                  'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'Report del 07/04/2014',                  'description': 'md5:f27c544694cacb46a078db84ec35d2d9',                  'upload_date': '20140407',                  'duration': 6160, +                'thumbnail': 're:^https?://.*\.jpg$',              }          },          { +            # no m3u8 stream              'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', -            'md5': 'd9751b78eac9710d62c2447b224dea39', +            # HDS download, MD5 is unstable              'info_dict': {          
        'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',                  'ext': 'flv',                  'title': 'TG PRIMO TEMPO',                  'upload_date': '20140612',                  'duration': 1758, +                'thumbnail': 're:^https?://.*\.jpg$',              }, +            'skip': 'Geo-restricted to Italy',          },          {              'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', @@ -67,127 +161,70 @@ class RaiTVIE(InfoExtractor):          },          {              'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', -            'md5': '496ab63e420574447f70d02578333437', +            'md5': 'e57493e1cb8bc7c564663f363b171847',              'info_dict': {                  'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'Il Candidato - Primo episodio: "Le Primarie"',                  'description': 'md5:364b604f7db50594678f483353164fb8',                  'upload_date': '20140923',                  'duration': 386, +                'thumbnail': 're:^https?://.*\.jpg$',              }          },      ]      def _real_extract(self, url):          video_id = self._match_id(url) -        media = self._download_json( -            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, -            video_id, 'Downloading video JSON') - -        thumbnails = [] -        for image_type in ('image', 'image_medium', 'image_300'): -            thumbnail_url = media.get(image_type) -            if thumbnail_url: -                thumbnails.append({ -                    'url': thumbnail_url, -                }) - -        subtitles = [] -        formats = [] -        media_type = media['type'] -        if 'Audio' in media_type: -            formats.append({ -         
       'format_id': media.get('formatoAudio'), -                'url': media['audioUrl'], -                'ext': media.get('formatoAudio'), -            }) -        elif 'Video' in media_type: -            def fix_xml(xml): -                return xml.replace(' tag elementi', '').replace('>/', '</') - -            relinker = self._download_xml( -                media['mediaUri'] + '&output=43', -                video_id, transform_source=fix_xml) - -            has_subtitle = False - -            for element in relinker.findall('element'): -                media_url = xpath_text(element, 'url') -                ext = determine_ext(media_url) -                content_type = xpath_text(element, 'content-type') -                if ext == 'm3u8': -                    formats.extend(self._extract_m3u8_formats( -                        media_url, video_id, 'mp4', 'm3u8_native', -                        m3u8_id='hls', fatal=False)) -                elif ext == 'f4m': -                    formats.extend(self._extract_f4m_formats( -                        media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', -                        video_id, f4m_id='hds', fatal=False)) -                elif ext == 'stl': -                    has_subtitle = True -                elif content_type.startswith('video/'): -                    bitrate = int_or_none(xpath_text(element, 'bitrate')) -                    formats.append({ -                        'url': media_url, -                        'tbr': bitrate if bitrate > 0 else None, -                        'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', -                    }) -                elif content_type.startswith('image/'): -                    thumbnails.append({ -                        'url': media_url, -                    }) - -            self._sort_formats(formats) -            if has_subtitle: -                webpage = self._download_webpage(url, video_id) -                subtitles = 
self._get_subtitles(video_id, webpage) -        else: -            raise ExtractorError('not a media file') +        return self._extract_from_content_id(video_id, url) -        return { -            'id': video_id, -            'title': media['name'], -            'description': media.get('desc'), -            'thumbnails': thumbnails, -            'uploader': media.get('author'), -            'upload_date': unified_strdate(media.get('date')), -            'duration': parse_duration(media.get('length')), -            'formats': formats, -            'subtitles': subtitles, -        } -    def _get_subtitles(self, video_id, webpage): -        subtitles = {} -        m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) -        if m: -            captions = m.group('captions') -            STL_EXT = '.stl' -            SRT_EXT = '.srt' -            if captions.endswith(STL_EXT): -                captions = captions[:-len(STL_EXT)] + SRT_EXT -            subtitles['it'] = [{ -                'ext': 'srt', -                'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), -            }] -        return subtitles - - -class RaiIE(InfoExtractor): +class RaiIE(RaiBaseIE):      _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'      _TESTS = [          {              'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', -            'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', +            'md5': '2dd727e61114e1ee9c47f0da6914e178',              'info_dict': {                  'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'Il pacco',                  'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',                  'upload_date': '20141221',              }, -        } +        }, + 
       { +            # Direct relinker URL +            'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', +            # HDS live stream, MD5 is unstable +            'info_dict': { +                'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', +                'ext': 'flv', +                'title': 'EuroNews', +            }, +            'skip': 'Geo-restricted to Italy', +        }, +        { +            # Embedded content item ID +            'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', +            'md5': '84c1135ce960e8822ae63cec34441d63', +            'info_dict': { +                'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8', +                'ext': 'mp4', +                'title': 'TG1 ore 20:00 del 02/07/2016', +                'upload_date': '20160702', +            }, +        }, +        { +            'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', +            # HDS live stream, MD5 is unstable +            'info_dict': { +                'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', +                'ext': 'flv', +                'title': 'La diretta di Rainews24', +            }, +        },      ]      @classmethod @@ -201,7 +238,30 @@ class RaiIE(InfoExtractor):          iframe_url = self._search_regex(              [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"',               r'drawMediaRaiTV\(["\'](.+?)["\']'], -            webpage, 'iframe') -        if not iframe_url.startswith('http'): -            iframe_url = compat_urlparse.urljoin(url, iframe_url) -        return self.url_result(iframe_url) +            webpage, 'iframe', default=None) +        if iframe_url: +            if not iframe_url.startswith('http'): +                iframe_url = compat_urlparse.urljoin(url, iframe_url) +            return self.url_result(iframe_url) 
+ +        content_item_id = self._search_regex( +            r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', +            webpage, 'content item ID', group='content_id', default=None) +        if content_item_id: +            return self._extract_from_content_id(content_item_id, url) + +        relinker_url = compat_urlparse.urljoin(url, self._search_regex( +            r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', +            webpage, 'relinker URL', group='url')) +        formats = self._extract_relinker_formats(relinker_url, video_id) +        self._sort_formats(formats) + +        title = self._search_regex( +            r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', +            webpage, 'title', group='title', default=None) or self._og_search_title(webpage) + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index 796adfdf9..bf200ea4d 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -1,23 +1,23 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import (      parse_duration,      parse_iso8601, +    js_to_json,  ) +from ..compat import compat_str  class RDSIE(InfoExtractor):      IE_DESC = 'RDS.ca' -    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)' +    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+'      _TESTS = [{          'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',          'info_dict': { -            'id': '3.1132799', +            'id': '604333',              'display_id': 
'fowler-jr-prend-la-direction-de-jacksonville',              'ext': 'mp4',              'title': 'Fowler Jr. prend la direction de Jacksonville', @@ -33,22 +33,17 @@ class RDSIE(InfoExtractor):      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        display_id = mobj.group('display_id') +        display_id = self._match_id(url)          webpage = self._download_webpage(url, display_id) -        # TODO: extract f4m from 9c9media.com -        video_url = self._search_regex( -            r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"', -            webpage, 'video url') - -        title = self._og_search_title(webpage) or self._html_search_meta( +        item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json) +        video_id = compat_str(item['id']) +        title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta(              'title', webpage, 'title', fatal=True)          description = self._og_search_description(webpage) or self._html_search_meta(              'description', webpage, 'description') -        thumbnail = self._og_search_thumbnail(webpage) or self._search_regex( +        thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex(              [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',               r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],              webpage, 'thumbnail', fatal=False) @@ -61,13 +56,15 @@ class RDSIE(InfoExtractor):          age_limit = self._family_friendly_search(webpage)          return { +            '_type': 'url_transparent',              'id': video_id,              'display_id': display_id, -            'url': video_url, +            'url': '9c9media:rds_web:%s' % video_id,              'title': title,              'description': description,              'thumbnail': 
thumbnail,              'timestamp': timestamp,              'duration': duration,              'age_limit': age_limit, +            'ie_key': 'NineCNineMedia',          } diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py new file mode 100644 index 000000000..f5b2f560c --- /dev/null +++ b/youtube_dl/extractor/roosterteeth.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    int_or_none, +    strip_or_none, +    unescapeHTML, +    urlencode_postdata, +) + + +class RoosterTeethIE(InfoExtractor): +    _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/episode/(?P<id>[^/?#&]+)' +    _LOGIN_URL = 'https://roosterteeth.com/login' +    _NETRC_MACHINE = 'roosterteeth' +    _TESTS = [{ +        'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', +        'md5': 'e2bd7764732d785ef797700a2489f212', +        'info_dict': { +            'id': '26576', +            'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', +            'ext': 'mp4', +            'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement', +            'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5', +            'thumbnail': 're:^https?://.*\.png$', +            'series': 'Million Dollars, But...', +            'episode': 'Million Dollars, But... 
The Game Announcement', +            'comment_count': int, +        }, +    }, { +        'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', +        'only_matching': True, +    }, { +        'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', +        'only_matching': True, +    }, { +        'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', +        'only_matching': True, +    }, { +        'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', +        'only_matching': True, +    }, { +        # only available for FIRST members +        'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', +        'only_matching': True, +    }] + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            return + +        login_page = self._download_webpage( +            self._LOGIN_URL, None, +            note='Downloading login page', +            errnote='Unable to download login page') + +        login_form = self._hidden_inputs(login_page) + +        login_form.update({ +            'username': username, +            'password': password, +        }) + +        login_request = self._download_webpage( +            self._LOGIN_URL, None, +            note='Logging in as %s' % username, +            data=urlencode_postdata(login_form), +            headers={ +                'Referer': self._LOGIN_URL, +            }) + +        if not any(re.search(p, login_request) for p in ( +                r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"', +                r'>Sign Out<')): +            error = self._html_search_regex( +                
r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>', +                login_request, 'alert', default=None, group='error') +            if error: +                raise ExtractorError('Unable to login: %s' % error, expected=True) +            raise ExtractorError('Unable to log in') + +    def _real_initialize(self): +        self._login() + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        episode = strip_or_none(unescapeHTML(self._search_regex( +            (r'videoTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', +             r'<title>(?P<title>[^<]+)</title>'), webpage, 'title', +            default=None, group='title'))) + +        title = strip_or_none(self._og_search_title( +            webpage, default=None)) or episode + +        m3u8_url = self._search_regex( +            r'file\s*:\s*(["\'])(?P<url>http.+?\.m3u8.*?)\1', +            webpage, 'm3u8 url', default=None, group='url') + +        if not m3u8_url: +            if re.search(r'<div[^>]+class=["\']non-sponsor', webpage): +                self.raise_login_required( +                    '%s is only available for FIRST members' % display_id) + +            if re.search(r'<div[^>]+class=["\']golive-gate', webpage): +                self.raise_login_required('%s is not available yet' % display_id) + +            raise ExtractorError('Unable to extract m3u8 URL') + +        formats = self._extract_m3u8_formats( +            m3u8_url, display_id, ext='mp4', +            entry_protocol='m3u8_native', m3u8_id='hls') +        self._sort_formats(formats) + +        description = strip_or_none(self._og_search_description(webpage)) +        thumbnail = self._proto_relative_url(self._og_search_thumbnail(webpage)) + +        series = self._search_regex( +            (r'<h2>More ([^<]+)</h2>', r'<a[^>]+>See All ([^<]+) Videos<'), +            webpage, 
'series', fatal=False) + +        comment_count = int_or_none(self._search_regex( +            r'>Comments \((\d+)\)<', webpage, +            'comment count', fatal=False)) + +        video_id = self._search_regex( +            (r'containerId\s*=\s*["\']episode-(\d+)\1', +             r'<div[^<]+id=["\']episode-(\d+)'), webpage, +            'video id', default=display_id) + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'series': series, +            'episode': episode, +            'comment_count': comment_count, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index 4896d09d6..f6454c6b0 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -9,7 +9,7 @@ class RTVNHIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)'      _TEST = {          'url': 'http://www.rtvnh.nl/video/131946', -        'md5': '6e1d0ab079e2a00b6161442d3ceacfc1', +        'md5': 'cdbec9f44550763c8afc96050fa747dc',          'info_dict': {              'id': '131946',              'ext': 'mp4', @@ -29,15 +29,29 @@ class RTVNHIE(InfoExtractor):              raise ExtractorError(                  '%s returned error code %d' % (self.IE_NAME, status), expected=True) -        formats = self._extract_smil_formats( -            'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False) - -        for item in meta['source']['fb']: -            if item.get('type') == 'hls': -                formats.extend(self._extract_m3u8_formats( -                    item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) -            elif item.get('type') == '': -                formats.append({'url': item['file']}) +        formats = [] +        rtmp_formats = self._extract_smil_formats( +            
'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) +        formats.extend(rtmp_formats) + +        for rtmp_format in rtmp_formats: +            rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) +            rtsp_format = rtmp_format.copy() +            del rtsp_format['play_path'] +            del rtsp_format['ext'] +            rtsp_format.update({ +                'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), +                'url': rtmp_url.replace('rtmp://', 'rtsp://'), +                'protocol': 'rtsp', +            }) +            formats.append(rtsp_format) +            http_base_url = rtmp_url.replace('rtmp://', 'http://') +            formats.extend(self._extract_m3u8_formats( +                http_base_url + '/playlist.m3u8', video_id, 'mp4', +                'm3u8_native', m3u8_id='hls', fatal=False)) +            formats.extend(self._extract_f4m_formats( +                http_base_url + '/manifest.f4m', +                video_id, f4m_id='hds', fatal=False))          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py index 759898a49..96e43af84 100644 --- a/youtube_dl/extractor/sandia.py +++ b/youtube_dl/extractor/sandia.py @@ -1,18 +1,12 @@  # coding: utf-8  from __future__ import unicode_literals -import itertools  import json -import re  from .common import InfoExtractor -from ..compat import compat_urlparse  from ..utils import (      int_or_none, -    js_to_json,      mimetype2ext, -    sanitized_Request, -    unified_strdate,  ) @@ -27,7 +21,8 @@ class SandiaIE(InfoExtractor):              'ext': 'mp4',              'title': 'Xyce Software Training - Section 1',              'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', -            'upload_date': '20120904', +            'upload_date': '20120409', +            'timestamp': 1333983600,              'duration': 7794,          }      } @@ -35,81 +30,36 @@ class 
SandiaIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        req = sanitized_Request(url) -        req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4') -        webpage = self._download_webpage(req, video_id) +        presentation_data = self._download_json( +            'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', +            video_id, data=json.dumps({ +                'getPlayerOptionsRequest': { +                    'ResourceId': video_id, +                    'QueryString': '', +                } +            }), headers={ +                'Content-Type': 'application/json; charset=utf-8', +            })['d']['Presentation'] -        js_path = self._search_regex( -            r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"', -            webpage, 'JS code URL') -        js_url = compat_urlparse.urljoin(url, js_path) - -        js_code = self._download_webpage( -            js_url, video_id, note='Downloading player') - -        def extract_str(key, **args): -            return self._search_regex( -                r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key), -                js_code, key, **args) - -        def extract_data(key, **args): -            data_json = extract_str(key, **args) -            if data_json is None: -                return data_json -            return self._parse_json( -                data_json, video_id, transform_source=js_to_json) +        title = presentation_data['Title']          formats = [] -        for i in itertools.count(): -            fd = extract_data('VideoUrls[%d]' % i, default=None) -            if fd is None: -                break -            formats.append({ -                'format_id': '%s' % i, -                'format_note': fd['MimeType'].partition('/')[2], -                'ext': mimetype2ext(fd['MimeType']), -                'url': 
fd['Location'], -                'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, -            }) +        for stream in presentation_data.get('Streams', []): +            for fd in stream.get('VideoUrls', []): +                formats.append({ +                    'format_id': fd['MediaType'], +                    'format_note': fd['MimeType'].partition('/')[2], +                    'ext': mimetype2ext(fd['MimeType']), +                    'url': fd['Location'], +                    'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, +                })          self._sort_formats(formats) -        slide_baseurl = compat_urlparse.urljoin( -            url, extract_data('SlideBaseUrl')) -        slide_template = slide_baseurl + re.sub( -            r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate')) -        slides = [] -        last_slide_time = 0 -        for i in itertools.count(1): -            sd = extract_str('Slides[%d]' % i, default=None) -            if sd is None: -                break -            timestamp = int_or_none(self._search_regex( -                r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),', -                sd, 'slide %s timestamp' % i, fatal=False)) -            slides.append({ -                'url': slide_template % i, -                'duration': timestamp - last_slide_time, -            }) -            last_slide_time = timestamp -        formats.append({ -            'format_id': 'slides', -            'protocol': 'slideshow', -            'url': json.dumps(slides), -            'preference': -10000,  # Downloader not yet written -        }) -        self._sort_formats(formats) - -        title = extract_data('Title') -        description = extract_data('Description', fatal=False) -        duration = int_or_none(extract_data( -            'Duration', fatal=False), scale=1000) -        upload_date = unified_strdate(extract_data('AirDate', 
fatal=False)) -          return {              'id': video_id,              'title': title, -            'description': description, +            'description': presentation_data.get('Description'),              'formats': formats, -            'upload_date': upload_date, -            'duration': duration, +            'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000), +            'duration': int_or_none(presentation_data.get('Duration'), 1000),          } diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py new file mode 100644 index 000000000..d3aba58a2 --- /dev/null +++ b/youtube_dl/extractor/sixplay.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    qualities, +    int_or_none, +    mimetype2ext, +    determine_ext, +) + + +class SixPlayIE(InfoExtractor): +    _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://www.6play.fr/jamel-et-ses-amis-au-marrakech-du-rire-p_1316/jamel-et-ses-amis-au-marrakech-du-rire-2015-c_11495320', +        'md5': '42310bffe4ba3982db112b9cd3467328', +        'info_dict': { +            'id': '11495320', +            'ext': 'mp4', +            'title': 'Jamel et ses amis au Marrakech du rire 2015', +            'description': 'md5:ba2149d5c321d5201b78070ee839d872', +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        clip_data = self._download_json( +            'https://player.m6web.fr/v2/video/config/6play-auth/FR/%s.json' % video_id, +            video_id) +        video_data = clip_data['videoInfo'] + +        quality_key = qualities(['lq', 'sd', 'hq', 'hd']) +        formats = [] +        for source in clip_data['sources']: +            source_type, source_url = source.get('type'), source.get('src') +            if not source_url or source_type == 'hls/primetime': +                
continue +            ext = mimetype2ext(source_type) or determine_ext(source_url) +            if ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    source_url, video_id, 'mp4', 'm3u8_native', +                    m3u8_id='hls', fatal=False)) +                formats.extend(self._extract_f4m_formats( +                    source_url.replace('.m3u8', '.f4m'), +                    video_id, f4m_id='hds', fatal=False)) +            elif ext == 'mp4': +                quality = source.get('quality') +                formats.append({ +                    'url': source_url, +                    'format_id': quality, +                    'quality': quality_key(quality), +                    'ext': ext, +                }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': video_data['title'].strip(), +            'description': video_data.get('description'), +            'duration': int_or_none(video_data.get('duration')), +            'series': video_data.get('titlePgm'), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py index 05e1b02ad..fffc9aa22 100644 --- a/youtube_dl/extractor/skynewsarabia.py +++ b/youtube_dl/extractor/skynewsarabia.py @@ -67,7 +67,7 @@ class SkyNewsArabiaIE(SkyNewsArabiaBaseIE):  class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): -    IE_NAME = 'skynewsarabia:video' +    IE_NAME = 'skynewsarabia:article'      _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)'      _TESTS = [{          'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', diff --git a/youtube_dl/extractor/skysports.py 
b/youtube_dl/extractor/skysports.py new file mode 100644 index 000000000..9dc78c7d2 --- /dev/null +++ b/youtube_dl/extractor/skysports.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SkySportsIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', +        'md5': 'c44a1db29f27daf9a0003e010af82100', +        'info_dict': { +            'id': '10328419', +            'ext': 'flv', +            'title': 'Bale: Its our time to shine', +            'description': 'md5:9fd1de3614d525f5addda32ac3c482c9', +        }, +        'add_ie': ['Ooyala'], +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        return { +            '_type': 'url_transparent', +            'id': video_id, +            'url': 'ooyala:%s' % self._search_regex( +                r'data-video-id="([^"]+)"', webpage, 'ooyala id'), +            'title': self._og_search_title(webpage), +            'description': self._og_search_description(webpage), +            'ie_key': 'Ooyala', +        } diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 0b717a1e4..4967c1b77 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -9,6 +9,7 @@ from ..compat import (  )  from ..utils import (      ExtractorError, +    get_element_by_id,  ) @@ -40,7 +41,7 @@ class SlideshareIE(InfoExtractor):          bucket = info['jsplayer']['video_bucket']          ext = info['jsplayer']['video_extension']          video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' 
+ ext) -        description = self._html_search_regex( +        description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(              r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,              'description', fatal=False) @@ -51,5 +52,5 @@ class SlideshareIE(InfoExtractor):              'ext': ext,              'url': video_url,              'thumbnail': info['slideshow']['pin_image_url'], -            'description': description, +            'description': description.strip() if description else None,          } diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 49e5d09ae..72fe66142 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -8,10 +8,7 @@ from ..compat import (      compat_str,      compat_urllib_parse_urlencode,  ) -from ..utils import ( -    ExtractorError, -    sanitized_Request, -) +from ..utils import ExtractorError  class SohuIE(InfoExtractor): @@ -96,15 +93,10 @@ class SohuIE(InfoExtractor):              else:                  base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' -            req = sanitized_Request(base_data_url + vid_id) - -            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') -            if cn_verification_proxy: -                req.add_header('Ytdl-request-proxy', cn_verification_proxy) -              return self._download_json( -                req, video_id, -                'Downloading JSON data for %s' % vid_id) +                base_data_url + vid_id, video_id, +                'Downloading JSON data for %s' % vid_id, +                headers=self.geo_verification_headers())          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 39a7aaf9d..3c552807e 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -4,8 +4,13 @@ 
from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_urlparse  from .spiegeltv import SpiegeltvIE +from ..compat import compat_urlparse +from ..utils import ( +    extract_attributes, +    unified_strdate, +    get_element_by_attribute, +)  class SpiegelIE(InfoExtractor): @@ -19,6 +24,7 @@ class SpiegelIE(InfoExtractor):              'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',              'description': 'md5:8029d8310232196eb235d27575a8b9f4',              'duration': 49, +            'upload_date': '20130311',          },      }, {          'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', @@ -29,6 +35,7 @@ class SpiegelIE(InfoExtractor):              'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',              'description': 'md5:c2322b65e58f385a820c10fa03b2d088',              'duration': 983, +            'upload_date': '20131115',          },      }, {          'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', @@ -38,6 +45,7 @@ class SpiegelIE(InfoExtractor):              'ext': 'mp4',              'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. 
Hier kommen seine Antworten auf die besten sechs Fragen.',              'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', +            'upload_date': '20140904',          }      }, {          'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', @@ -52,10 +60,10 @@ class SpiegelIE(InfoExtractor):          if SpiegeltvIE.suitable(handle.geturl()):              return self.url_result(handle.geturl(), 'Spiegeltv') -        title = re.sub(r'\s+', ' ', self._html_search_regex( -            r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)</(?:h1|div)>', -            webpage, 'title')) -        description = self._html_search_meta('description', webpage, 'description') +        video_data = extract_attributes(self._search_regex(r'(<div[^>]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) + +        title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) +        description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description')          base_url = self._search_regex(              [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'], @@ -87,8 +95,9 @@ class SpiegelIE(InfoExtractor):          return {              'id': video_id,              'title': title, -            'description': description, +            'description': description.strip() if description else None,              'duration': duration, +            'upload_date': unified_strdate(video_data.get('data-video-date')),              'formats': formats,          } @@ -104,6 +113,7 @@ class SpiegelArticleIE(InfoExtractor):              'ext': 'mp4',              'title': 'Faszination Badminton: Nennt es bloß nicht Federball',              'description': 're:^Patrick Kämnitz gehört.{100,}', +            'upload_date': '20140825',          },      }, {          
'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 74d01183f..409d50304 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -9,8 +9,9 @@ from ..utils import (  class SRMediathekIE(ARDMediathekIE): +    IE_NAME = 'sr:mediathek'      IE_DESC = 'Saarländischer Rundfunk' -    _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' +    _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'      _TESTS = [{          'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', @@ -34,7 +35,9 @@ class SRMediathekIE(ARDMediathekIE):              # m3u8 download              'skip_download': True,          }, -        'expected_warnings': ['Unable to download f4m manifest'] +    }, { +        'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index d5c852f52..0f8782d03 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -56,7 +56,7 @@ class StitcherIE(InfoExtractor):          episode = self._parse_json(              js_to_json(self._search_regex( -                r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')), +                r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')),              display_id)['config']['episode']          title = unescapeHTML(episode['title']) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 6526a6345..1c04dfb7b 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -7,13 +7,13 @@ from .common import InfoExtractor  from ..utils import (      determine_ext,      dict_get, + 
   int_or_none, +    try_get,  )  class SVTBaseIE(InfoExtractor): -    def _extract_video(self, info, video_id): -        video_info = self._get_video_info(info) - +    def _extract_video(self, video_info, video_id):          formats = []          for vr in video_info['videoReferences']:              player_type = vr.get('playerType') @@ -37,6 +37,8 @@ class SVTBaseIE(InfoExtractor):                      'format_id': player_type,                      'url': vurl,                  }) +        if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): +            self.raise_geo_restricted('This video is only available in Sweden')          self._sort_formats(formats)          subtitles = {} @@ -52,15 +54,32 @@ class SVTBaseIE(InfoExtractor):                      subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) -        duration = video_info.get('materialLength') -        age_limit = 18 if video_info.get('inappropriateForChildren') else 0 +        title = video_info.get('title') + +        series = video_info.get('programTitle') +        season_number = int_or_none(video_info.get('season')) +        episode = video_info.get('episodeTitle') +        episode_number = int_or_none(video_info.get('episodeNumber')) + +        duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) +        age_limit = None +        adult = dict_get( +            video_info, ('inappropriateForChildren', 'blockedForChildren'), +            skip_false_values=False) +        if adult is not None: +            age_limit = 18 if adult else 0          return {              'id': video_id, +            'title': title,              'formats': formats,              'subtitles': subtitles,              'duration': duration,              'age_limit': age_limit, +            'series': series, +            'season_number': season_number, +            'episode': episode, +            'episode_number': episode_number,          } @@ -85,9 +104,6 
@@ class SVTIE(SVTBaseIE):          if mobj:              return mobj.group('url') -    def _get_video_info(self, info): -        return info['video'] -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          widget_id = mobj.group('widget_id') @@ -97,15 +113,15 @@ class SVTIE(SVTBaseIE):              'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),              article_id) -        info_dict = self._extract_video(info, article_id) +        info_dict = self._extract_video(info['video'], article_id)          info_dict['title'] = info['context']['title']          return info_dict  class SVTPlayIE(SVTBaseIE):      IE_DESC = 'SVT Play and Öppet arkiv' -    _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)' +    _TESTS = [{          'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',          'md5': '2b6704fe4a28801e1a098bbf3c5ac611',          'info_dict': { @@ -121,25 +137,50 @@ class SVTPlayIE(SVTBaseIE):                  }]              },          }, -    } - -    def _get_video_info(self, info): -        return info['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'] +    }, { +        # geo restricted to Sweden +        'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', +        'only_matching': True, +    }, { +        'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        data = self._parse_json(self._search_regex( -            r'root\["__svtplay"\]\s*=\s*([^;]+);', webpage, 'embedded data'), video_id) +        data = self._parse_json( +            
self._search_regex( +                r'root\["__svtplay"\]\s*=\s*([^;]+);', +                webpage, 'embedded data', default='{}'), +            video_id, fatal=False)          thumbnail = self._og_search_thumbnail(webpage) -        info_dict = self._extract_video(data, video_id) -        info_dict.update({ -            'title': data['context']['dispatcher']['stores']['MetaStore']['title'], -            'thumbnail': thumbnail, -        }) - -        return info_dict +        if data: +            video_info = try_get( +                data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], +                dict) +            if video_info: +                info_dict = self._extract_video(video_info, video_id) +                info_dict.update({ +                    'title': data['context']['dispatcher']['stores']['MetaStore']['title'], +                    'thumbnail': thumbnail, +                }) +                return info_dict + +        video_id = self._search_regex( +            r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', +            webpage, 'video id', default=None) + +        if video_id: +            data = self._download_json( +                'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id) +            info_dict = self._extract_video(data, video_id) +            if not info_dict.get('title'): +                info_dict['title'] = re.sub( +                    r'\s*\|\s*.+?$', '', +                    info_dict.get('episode') or self._og_search_title(webpage)) +            return info_dict diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 4b4b740b4..2ecfd0405 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -1,50 +1,41 @@  # coding: utf-8  from __future__ import unicode_literals -import json +from .mitele import MiTeleBaseIE -from .common import InfoExtractor -from ..compat import ( -    
compat_urllib_parse_unquote, -    compat_urllib_parse_urlencode, -    compat_urlparse, -) -from ..utils import ( -    get_element_by_attribute, -    parse_duration, -    strip_jsonp, -) - -class TelecincoIE(InfoExtractor): +class TelecincoIE(MiTeleBaseIE):      IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'      _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'      _TESTS = [{          'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', -        'md5': '5cbef3ad5ef17bf0d21570332d140729', +        'md5': '8d7b2d5f699ee2709d992a63d5cd1712',          'info_dict': { -            'id': 'MDSVID20141015_0058', +            'id': 'JEA5ijCnF6p5W08A1rNKn7',              'ext': 'mp4', -            'title': 'Con Martín Berasategui, hacer un bacalao al ...', +            'title': 'Bacalao con kokotxas al pil-pil', +            'description': 'md5:1382dacd32dd4592d478cbdca458e5bb',              'duration': 662,          },      }, {          'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', -        'md5': '0a5b9f3cc8b074f50a0578f823a12694', +        'md5': '284393e5387b3b947b77c613ef04749a',          'info_dict': { -            'id': 'MDSVID20150916_0128', +            'id': 'jn24Od1zGLG4XUZcnUnZB6',              'ext': 'mp4', -            'title': '¿Quién es este ex futbolista con el que hablan ...', +            'title': '¿Quién es este ex futbolista con el que hablan Leo Messi y Luis Suárez?', +            'description': 'md5:a62ecb5f1934fc787107d7b9a2262805',              'duration': 79,          },      }, {          'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', -        'md5': 'ad1bfaaba922dd4a295724b05b68f86a', +        'md5': '749afab6ea5a136a8806855166ae46a2',          'info_dict': { -            'id': 'MDSVID20150513_0220', 
+            'id': 'aywerkD2Sv1vGNqq9b85Q2',              'ext': 'mp4',              'title': '#DOYLACARA. Con la trata no hay trato', +            'description': 'md5:2771356ff7bfad9179c5f5cd954f1477',              'duration': 50,          },      }, { @@ -56,40 +47,16 @@ class TelecincoIE(InfoExtractor):      }]      def _real_extract(self, url): -        episode = self._match_id(url) -        webpage = self._download_webpage(url, episode) -        embed_data_json = self._search_regex( -            r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', -        ).replace('\'', '"') -        embed_data = json.loads(embed_data_json) - -        domain = embed_data['mediaUrl'] -        if not domain.startswith('http'): -            # only happens in telecinco.es videos -            domain = 'http://' + domain -        info_url = compat_urlparse.urljoin( -            domain, -            compat_urllib_parse_unquote(embed_data['flashvars']['host']) -        ) -        info_el = self._download_xml(info_url, episode).find('./video/info') - -        video_link = info_el.find('videoUrl/link').text -        token_query = compat_urllib_parse_urlencode({'id': video_link}) -        token_info = self._download_json( -            embed_data['flashvars']['ov_tk'] + '?' 
+ token_query, -            episode, -            transform_source=strip_jsonp -        ) -        formats = self._extract_m3u8_formats( -            token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native') -        self._sort_formats(formats) - -        return { -            'id': embed_data['videoId'], -            'display_id': episode, -            'title': info_el.find('title').text, -            'formats': formats, -            'description': get_element_by_attribute('class', 'text', webpage), -            'thumbnail': info_el.find('thumb').text, -            'duration': parse_duration(info_el.find('duration').text), -        } +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        title = self._html_search_meta( +            ['og:title', 'twitter:title'], webpage, 'title') +        info = self._get_player_info(url, webpage) +        info.update({ +            'display_id': display_id, +            'title': title, +            'description': self._html_search_meta( +                ['og:description', 'twitter:description'], +                webpage, 'title', fatal=False), +        }) +        return info diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 6c848dc6f..e595c4a69 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -48,6 +48,6 @@ class TF1IE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          wat_id = self._html_search_regex( -            r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8}).*?\1', +            r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',              webpage, 'wat id', group='id')          return self.url_result('wat:%s' % wat_id, 'Wat') diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 07d222ae3..bb3efc4ea 100644 --- a/youtube_dl/extractor/theplatform.py +++ 
b/youtube_dl/extractor/theplatform.py @@ -6,6 +6,7 @@ import time  import hmac  import binascii  import hashlib +import netrc  from .once import OnceIE @@ -24,6 +25,9 @@ from ..utils import (      xpath_with_ns,      mimetype2ext,      find_xpath_attr, +    unescapeHTML, +    urlencode_postdata, +    unified_timestamp,  )  default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -62,10 +66,11 @@ class ThePlatformBaseIE(OnceIE):          return formats, subtitles -    def get_metadata(self, path, video_id): +    def _download_theplatform_metadata(self, path, video_id):          info_url = 'http://link.theplatform.com/s/%s?format=preview' % path -        info = self._download_json(info_url, video_id) +        return self._download_json(info_url, video_id) +    def _parse_theplatform_metadata(self, info):          subtitles = {}          captions = info.get('captions')          if isinstance(captions, list): @@ -86,6 +91,10 @@ class ThePlatformBaseIE(OnceIE):              'uploader': info.get('billingCode'),          } +    def _extract_theplatform_metadata(self, path, video_id): +        info = self._download_theplatform_metadata(path, video_id) +        return self._parse_theplatform_metadata(info) +  class ThePlatformIE(ThePlatformBaseIE):      _VALID_URL = r'''(?x) @@ -158,6 +167,7 @@ class ThePlatformIE(ThePlatformBaseIE):          'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781',          'only_matching': True,      }] +    _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s'      @classmethod      def _extract_urls(cls, webpage): @@ -192,6 +202,96 @@ class ThePlatformIE(ThePlatformBaseIE):          sig = flags + expiration_date + checksum + str_to_hex(sig_secret)          return '%s&sig=%s' % (url, sig) +    def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): +        def xml_text(xml_str, 
tag): +            return self._search_regex( +                '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag) + +        mvpd_headers = { +            'ap_42': 'anonymous', +            'ap_11': 'Linux i686', +            'ap_z': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', +            'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', +        } + +        guid = xml_text(resource, 'guid') +        requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} +        authn_token = requestor_info.get('authn_token') +        if authn_token: +            token_expires = unified_timestamp(xml_text(authn_token, 'simpleTokenExpires').replace('_GMT', '')) +            if token_expires and token_expires >= time.time(): +                authn_token = None +        if not authn_token: +            # TODO add support for other TV Providers +            mso_id = 'DTV' +            login_info = netrc.netrc().authenticators(mso_id) +            if not login_info: +                return None + +            def post_form(form_page, note, data={}): +                post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') +                return self._download_webpage( +                    post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={ +                        'Content-Type': 'application/x-www-form-urlencoded', +                    }) + +            provider_redirect_page = self._download_webpage( +                self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, +                'Downloading Provider Redirect Page', query={ +                    'noflash': 'true', +                    'mso_id': mso_id, +                    'requestor_id': requestor_id, +                    'no_iframe': 'false', +                    'domain_name': 'adobe.com', +                    'redirect_url': url, +    
            }) +            provider_login_page = post_form( +                provider_redirect_page, 'Downloading Provider Login Page') +            mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { +                'username': login_info[0], +                'password': login_info[2], +            }) +            post_form(mvpd_confirm_page, 'Confirming Login') + +            session = self._download_webpage( +                self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, +                'Retrieving Session', data=urlencode_postdata({ +                    '_method': 'GET', +                    'requestor_id': requestor_id, +                }), headers=mvpd_headers) +            authn_token = unescapeHTML(xml_text(session, 'authnToken')) +            requestor_info['authn_token'] = authn_token +            self._downloader.cache.store('mvpd', requestor_id, requestor_info) + +        authz_token = requestor_info.get(guid) +        if not authz_token: +            authorize = self._download_webpage( +                self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, +                'Retrieving Authorization Token', data=urlencode_postdata({ +                    'resource_id': resource, +                    'requestor_id': requestor_id, +                    'authentication_token': authn_token, +                    'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), +                    'userMeta': '1', +                }), headers=mvpd_headers) +            authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) +            requestor_info[guid] = authz_token +            self._downloader.cache.store('mvpd', requestor_id, requestor_info) + +        mvpd_headers.update({ +            'ap_19': xml_text(authn_token, 'simpleSamlNameID'), +            'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), +        }) + +        return self._download_webpage( +            self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', +       
     video_id, 'Retrieving Media Token', data=urlencode_postdata({ +                'authz_token': authz_token, +                'requestor_id': requestor_id, +                'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), +                'hashed_guid': 'false', +            }), headers=mvpd_headers) +      def _real_extract(self, url):          url, smuggled_data = unsmuggle_url(url, {}) @@ -265,7 +365,7 @@ class ThePlatformIE(ThePlatformBaseIE):          formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)          self._sort_formats(formats) -        ret = self.get_metadata(path, video_id) +        ret = self._extract_theplatform_metadata(path, video_id)          combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)          ret.update({              'id': video_id, @@ -339,7 +439,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):          timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)          categories = [item['media$name'] for item in entry.get('media$categories', [])] -        ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id) +        ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id)          subtitles = self._merge_subtitles(subtitles, ret['subtitles'])          ret.update({              'id': video_id, diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py index c77a07989..a0bc12c81 100644 --- a/youtube_dl/extractor/threeqsdn.py +++ b/youtube_dl/extractor/threeqsdn.py @@ -92,12 +92,11 @@ class ThreeQSDNIE(InfoExtractor):              if not item_url or item_url in urls:                  return              urls.add(item_url) -            type_ = item.get('type') -            ext = determine_ext(item_url, default_ext=None) -            if type_ == 'application/dash+xml' or ext == 'mpd': +            ext = mimetype2ext(item.get('type')) or determine_ext(item_url, 
default_ext=None) +            if ext == 'mpd':                  formats.extend(self._extract_mpd_formats(                      item_url, video_id, mpd_id='mpd', fatal=False)) -            elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8': +            elif ext == 'm3u8':                  formats.extend(self._extract_m3u8_formats(                      item_url, video_id, 'mp4',                      entry_protocol='m3u8' if live else 'm3u8_native', @@ -111,7 +110,7 @@ class ThreeQSDNIE(InfoExtractor):                  formats.append({                      'url': item_url,                      'format_id': item.get('quality'), -                    'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext, +                    'ext': 'mp4' if item_url.startswith('rtsp') else ext,                      'vcodec': 'none' if stream_type == 'audio' else None,                  }) diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 4797d1310..54c2d0aa6 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -1,74 +1,41 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -    unified_strdate, -) +from ..utils import int_or_none  class TouTvIE(InfoExtractor):      IE_NAME = 'tou.tv' -    _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))' +    _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+/S[0-9]+E[0-9]+)'      _TEST = { -        'url': 'http://www.tou.tv/30-vies/S04E41', +        'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17',          'info_dict': { -            'id': '30-vies_S04E41', +            'id': '122017',              'ext': 'mp4', -            'title': '30 vies Saison 4 / Épisode 41', -            'description': 'md5:da363002db82ccbe4dafeb9cab039b09', -            'age_limit': 8, -            
'uploader': 'Groupe des Nouveaux Médias', -            'duration': 1296, -            'upload_date': '20131118', -            'thumbnail': 'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg', +            'title': 'Saison 2015 Épisode 17', +            'description': 'La photo de famille 2', +            'upload_date': '20100717',          },          'params': { -            'skip_download': True,  # Requires rtmpdump +            # m3u8 download +            'skip_download': True,          }, -        'skip': 'Only available in Canada'      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        webpage = self._download_webpage(url, video_id) - -        mediaId = self._search_regex( -            r'"idMedia":\s*"([^"]+)"', webpage, 'media ID') - -        streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId -        streams_doc = self._download_xml( -            streams_url, video_id, note='Downloading stream list') - -        video_url = next(n.text -                         for n in streams_doc.findall('.//choice/url') -                         if '//ad.doubleclick' not in n.text) -        if video_url.endswith('/Unavailable.flv'): -            raise ExtractorError( -                'Access to this video is blocked from outside of Canada', -                expected=True) - -        duration_str = self._html_search_meta( -            'video:duration', webpage, 'duration') -        duration = int(duration_str) if duration_str else None -        upload_date_str = self._html_search_meta( -            'video:release_date', webpage, 'upload date') -        upload_date = unified_strdate(upload_date_str) if upload_date_str else None +        path = self._match_id(url) +        metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path) +        video_id = metadata['IdMedia'] +        details = metadata['Details'] +        title 
= details['OriginalTitle']          return { +            '_type': 'url_transparent', +            'url': 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id),              'id': video_id, -            'title': self._og_search_title(webpage), -            'url': video_url, -            'description': self._og_search_description(webpage), -            'uploader': self._dc_search_uploader(webpage), -            'thumbnail': self._og_search_thumbnail(webpage), -            'age_limit': self._media_rating_search(webpage), -            'duration': duration, -            'upload_date': upload_date, -            'ext': 'mp4', +            'title': title, +            'thumbnail': details.get('ImageUrl'), +            'duration': int_or_none(details.get('LengthInSeconds')),          } diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index a4997cb89..5070082da 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -4,6 +4,12 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    clean_html, +    get_element_by_attribute, +    ExtractorError, +)  class TVPIE(InfoExtractor): @@ -21,7 +27,7 @@ class TVPIE(InfoExtractor):          },      }, {          'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', -        'md5': 'c3b15ed1af288131115ff17a17c19dda', +        'md5': 'b0005b542e5b4de643a9690326ab1257',          'info_dict': {              'id': '17916176',              'ext': 'mp4', @@ -53,6 +59,11 @@ class TVPIE(InfoExtractor):          webpage = self._download_webpage(              'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) +        error_massage = get_element_by_attribute('class', 'msg error', webpage) +        if error_massage: +            raise ExtractorError('%s said: %s' % ( +                self.IE_NAME, clean_html(error_massage)), expected=True) +          title = 
self._search_regex(              r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',              webpage, 'title', group='title') @@ -66,24 +77,50 @@ class TVPIE(InfoExtractor):              r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)          video_url = self._search_regex( -            r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None) -        if not video_url: +            r'0:{src:([\'"])(?P<url>.*?)\1', webpage, +            'formats', group='url', default=None) +        if not video_url or 'material_niedostepny.mp4' in video_url:              video_url = self._download_json(                  'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,                  video_id)['video_url'] -        ext = video_url.rsplit('.', 1)[-1] -        if ext != 'ism/manifest': -            if '/' in ext: -                ext = 'mp4' +        formats = [] +        video_url_base = self._search_regex( +            r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', +            video_url, 'video base url', default=None) +        if video_url_base: +            # TODO: Current DASH formats are broken - $Time$ pattern in +            # <SegmentTemplate> not implemented yet +            # formats.extend(self._extract_mpd_formats( +            #     video_url_base + '.ism/video.mpd', +            #     video_id, mpd_id='dash', fatal=False)) +            formats.extend(self._extract_f4m_formats( +                video_url_base + '.ism/video.f4m', +                video_id, f4m_id='hds', fatal=False)) +            m3u8_formats = self._extract_m3u8_formats( +                video_url_base + '.ism/video.m3u8', video_id, +                'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +            self._sort_formats(m3u8_formats) +            m3u8_formats = list(filter( +                lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', +                m3u8_formats)) +          
  formats.extend(m3u8_formats) +            for i, m3u8_format in enumerate(m3u8_formats, 2): +                http_url = '%s-%d.mp4' % (video_url_base, i) +                if self._is_valid_url(http_url, video_id): +                    f = m3u8_format.copy() +                    f.update({ +                        'url': http_url, +                        'format_id': f['format_id'].replace('hls', 'http'), +                        'protocol': 'http', +                    }) +                    formats.append(f) +        else:              formats = [{                  'format_id': 'direct',                  'url': video_url, -                'ext': ext, +                'ext': determine_ext(video_url, 'mp4'),              }] -        else: -            m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url) -            formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')          self._sort_formats(formats) diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py index f3198fb85..7a9386cde 100644 --- a/youtube_dl/extractor/tweakers.py +++ b/youtube_dl/extractor/tweakers.py @@ -1,25 +1,62 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    determine_ext, +    mimetype2ext, +)  class TweakersIE(InfoExtractor):      _VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)'      _TEST = {          'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html', -        'md5': '3147e4ddad366f97476a93863e4557c8', +        'md5': 'fe73e417c093a788e0160c4025f88b15',          'info_dict': {              'id': '9926',              'ext': 'mp4',              'title': 'New Nintendo 3DS XL - Op alle fronten beter', -            'description': 'md5:f97324cc71e86e11c853f0763820e3ba', +            'description': 'md5:3789b21fed9c0219e9bcaacd43fab280',              'thumbnail': 're:^https?://.*\.jpe?g$',              'duration': 
386, +            'uploader_id': 's7JeEm',          }      }      def _real_extract(self, url): -        playlist_id = self._match_id(url) -        entries = self._extract_xspf_playlist( -            'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id) -        return self.playlist_result(entries, playlist_id) +        video_id = self._match_id(url) +        video_data = self._download_json( +            'https://tweakers.net/video/s1playlist/%s/1920/1080/playlist.json' % video_id, +            video_id)['items'][0] + +        title = video_data['title'] + +        formats = [] +        for location in video_data.get('locations', {}).get('progressive', []): +            format_id = location.get('label') +            width = int_or_none(location.get('width')) +            height = int_or_none(location.get('height')) +            for source in location.get('sources', []): +                source_url = source.get('src') +                if not source_url: +                    continue +                ext = mimetype2ext(source.get('type')) or determine_ext(source_url) +                formats.append({ +                    'format_id': format_id, +                    'url': source_url, +                    'width': width, +                    'height': height, +                    'ext': ext, +                }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': video_data.get('description'), +            'thumbnail': video_data.get('poster'), +            'duration': int_or_none(video_data.get('duration')), +            'uploader_id': video_data.get('account'), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 20919774d..67b1277cc 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -29,7 +29,7 @@ class TwitchBaseIE(InfoExtractor):      
_VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'      _API_BASE = 'https://api.twitch.tv' -    _USHER_BASE = 'http://usher.twitch.tv' +    _USHER_BASE = 'https://usher.ttvnw.net'      _LOGIN_URL = 'http://www.twitch.tv/login'      _NETRC_MACHINE = 'twitch' diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py new file mode 100644 index 000000000..ce3bf6b02 --- /dev/null +++ b/youtube_dl/extractor/urplay.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class URPlayIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?urplay\.se/program/(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://urplay.se/program/190031-tripp-trapp-trad-sovkudde', +        'md5': '15ca67b63fd8fb320ac2bcd854bad7b6', +        'info_dict': { +            'id': '190031', +            'ext': 'mp4', +            'title': 'Tripp, Trapp, Träd : Sovkudde', +            'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) +        urplayer_data = self._parse_json(self._search_regex( +            r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) +        host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + +        formats = [] +        for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): +            file_rtmp = urplayer_data.get('file_rtmp' + quality_attr) +            if file_rtmp: +                formats.append({ +                    'url': 'rtmp://%s/urplay/mp4:%s' % (host, file_rtmp), +                    'format_id': quality + '-rtmp', +                    'ext': 'flv', +                    'preference': preference, +                }) +            file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + 
quality_attr) +            if file_http: +                file_http_base_url = 'http://%s/%s' % (host, file_http) +                formats.extend(self._extract_f4m_formats( +                    file_http_base_url + 'manifest.f4m', video_id, +                    preference, '%s-hds' % quality, fatal=False)) +                formats.extend(self._extract_m3u8_formats( +                    file_http_base_url + 'playlist.m3u8', video_id, 'mp4', +                    'm3u8_native', preference, '%s-hls' % quality, fatal=False)) +        self._sort_formats(formats) + +        subtitles = {} +        for subtitle in urplayer_data.get('subtitles', []): +            subtitle_url = subtitle.get('file') +            kind = subtitle.get('kind') +            if subtitle_url or kind and kind != 'captions': +                continue +            subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ +                'url': subtitle_url, +            }) + +        return { +            'id': video_id, +            'title': urplayer_data['title'], +            'description': self._og_search_description(webpage), +            'thumbnail': urplayer_data.get('image'), +            'series': urplayer_data.get('series_title'), +            'subtitles': subtitles, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/vidbit.py b/youtube_dl/extractor/vidbit.py new file mode 100644 index 000000000..e7ac5a842 --- /dev/null +++ b/youtube_dl/extractor/vidbit.py @@ -0,0 +1,84 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( +    int_or_none, +    js_to_json, +    remove_end, +    unified_strdate, +) + + +class VidbitIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)' +    _TESTS = [{ +        'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2', +        'md5': '1a34b7f14defe3b8fafca9796892924d', +        
'info_dict': { +            'id': 'jkL2yDOEq2', +            'ext': 'mp4', +            'title': 'Intro to VidBit', +            'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7', +            'thumbnail': 're:https?://.*\.jpg$', +            'upload_date': '20160618', +            'view_count': int, +            'comment_count': int, +        } +    }, { +        'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage( +            compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id) + +        video_url, title = [None] * 2 + +        config = self._parse_json(self._search_regex( +            r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'), +            video_id, transform_source=js_to_json) +        if config: +            if config.get('file'): +                video_url = compat_urlparse.urljoin(url, config['file']) +            title = config.get('title') + +        if not video_url: +            video_url = compat_urlparse.urljoin(url, self._search_regex( +                r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', +                webpage, 'video URL', group='url')) + +        if not title: +            title = remove_end( +                self._html_search_regex( +                    (r'<h1>(.+?)</h1>', r'<title>(.+?)</title>'), +                    webpage, 'title', default=None) or self._og_search_title(webpage), +                ' - VidBit') + +        description = self._html_search_meta( +            ('description', 'og:description', 'twitter:description'), +            webpage, 'description') + +        upload_date = unified_strdate(self._html_search_meta( +            'datePublished', webpage, 'upload date')) + +        view_count = int_or_none(self._search_regex( +            r'<strong>(\d+)</strong> views', +            webpage, 'view count', 
def _verify_video_password(self, url, video_id, webpage):
    """POST the user-supplied video password to ``url + '/password'``.

    The password is read from the downloader's ``videopassword`` parameter;
    an ExtractorError (marked as expected) is raised when it is not set.
    The xsrft token and vuid are taken from ``webpage``; the token is sent
    in the form body and the vuid is stored as a cookie before the request
    is made. Returns whatever ``_download_webpage`` returns for the
    password request.
    """
    video_password = self._downloader.params.get('videopassword')
    if video_password is None:
        raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)

    xsrft, vuid = self._extract_xsrft_and_vuid(webpage)
    form_data = urlencode_postdata({
        'password': video_password,
        'token': xsrft,
    })

    # vimeo only supports https now, but the user can give an http url
    if url.startswith('http://'):
        url = url.replace('http://', 'https://')

    request = sanitized_Request(url + '/password', form_data)
    for header_name, header_value in (
            ('Content-Type', 'application/x-www-form-urlencoded'),
            ('Referer', url)):
        request.add_header(header_name, header_value)

    self._set_vimeo_cookie('vuid', vuid)
    return self._download_webpage(
        request, video_id,
        'Verifying the password', 'Wrong password')
Tribute: The Shining', +                'uploader': 'Casey Donahue', +                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue', +                'uploader_id': 'caseydonahue', +                'upload_date': '20090821', +                'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', +            }, +            'params': { +                'skip_download': True, +            }, +            'expected_warnings': ['Unable to download JSON metadata'], +        }, +        { +            'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', +            'only_matching': True, +        }, +        {              'url': 'https://vimeo.com/109815029',              'note': 'Video not completely processed, "failed" seed status',              'only_matching': True, @@ -294,6 +336,10 @@ class VimeoIE(VimeoBaseInfoExtractor):              'only_matching': True,          },          { +            'url': 'https://vimeo.com/album/2632481/video/79010983', +            'only_matching': True, +        }, +        {              # source file returns 403: Forbidden              'url': 'https://vimeo.com/7809605',              'only_matching': True, @@ -318,26 +364,11 @@ class VimeoIE(VimeoBaseInfoExtractor):              r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)          if mobj:              return mobj.group(1) - -    def _verify_video_password(self, url, video_id, webpage): -        password = self._downloader.params.get('videopassword') -        if password is None: -            raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) -        token, vuid = self._extract_xsrft_and_vuid(webpage) -        data = urlencode_postdata({ -            'password': password, -            'token': token, -        }) -        if url.startswith('http://'): -            # vimeo only supports https now, but the user can give an http url -            url = 
url.replace('http://', 'https://') -        password_request = sanitized_Request(url + '/password', data) -        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') -        password_request.add_header('Referer', url) -        self._set_vimeo_cookie('vuid', vuid) -        return self._download_webpage( -            password_request, video_id, -            'Verifying the password', 'Wrong password') +        # Look more for non-standard embedded Vimeo player +        mobj = re.search( +            r'<video[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)(?P=q1)', webpage) +        if mobj: +            return mobj.group('url')      def _verify_player_video_password(self, url, video_id):          password = self._downloader.params.get('videopassword') @@ -369,7 +400,7 @@ class VimeoIE(VimeoBaseInfoExtractor):          orig_url = url          if mobj.group('pro') or mobj.group('player'):              url = 'https://player.vimeo.com/video/' + video_id -        else: +        elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):              url = 'https://vimeo.com/' + video_id          # Retrieve video webpage to extract further information @@ -630,8 +661,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):                  webpage = self._login_list_password(page_url, list_id, webpage)                  yield self._extract_list_title(webpage) -            for video_id in re.findall(r'id="clip_(\d+?)"', webpage): -                yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo') +            # Try extracting href first since not all videos are available via +            # short https://vimeo.com/id URL (e.g. 
https://vimeo.com/channels/tributes/6213729) +            clips = re.findall( +                r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)', webpage) +            if clips: +                for video_id, video_url in clips: +                    yield self.url_result( +                        compat_urlparse.urljoin(base_url, video_url), +                        VimeoIE.ie_key(), video_id=video_id) +            # More relaxed fallback +            else: +                for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): +                    yield self.url_result( +                        'https://vimeo.com/%s' % video_id, +                        VimeoIE.ie_key(), video_id=video_id)              if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:                  break @@ -668,7 +712,7 @@ class VimeoUserIE(VimeoChannelIE):  class VimeoAlbumIE(VimeoChannelIE):      IE_NAME = 'vimeo:album' -    _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)' +    _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))'      _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'      _TESTS = [{          'url': 'https://vimeo.com/album/2632481', @@ -688,6 +732,13 @@ class VimeoAlbumIE(VimeoChannelIE):          'params': {              'videopassword': 'youtube-dl',          } +    }, { +        'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail', +        'only_matching': True, +    }, { +        # TODO: respect page number +        'url': 'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail', +        'only_matching': True,      }]      def _page_url(self, base_url, pagenum): @@ -746,12 +797,39 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):              'thumbnail': 're:^https?://.*\.jpg$',              'uploader_id': 'user22258446',          } +    }, { +        'note': 'Password protected', +        'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde', +        
def _get_config_url(self, webpage_url, video_id, video_password_verified=False):
    """Return the value of the page's ``data-config-url`` attribute.

    The page at ``webpage_url`` is downloaded and searched for the
    attribute. On the first pass a missing attribute yields ``None``, in
    which case the video password is submitted and the lookup is repeated
    once with ``video_password_verified=True``. On that second pass the
    search is given ``NO_DEFAULT`` instead of ``None`` (presumably making a
    missing attribute an error rather than another retry - confirm against
    ``_html_search_regex``).
    """
    page = self._download_webpage(webpage_url, video_id)

    if video_password_verified:
        fallback = NO_DEFAULT
    else:
        fallback = None
    found = self._html_search_regex(
        r'data-config-url="([^"]+)"', page, 'config URL',
        default=fallback)
    if found is not None:
        return found

    # No config URL on the page: submit the password, then look again.
    self._verify_video_password(webpage_url, video_id, page)
    return self._get_config_url(
        webpage_url, video_id, video_password_verified=True)
'Mars Ruiz',              'uploader_id': '1102363502380728320', +            'view_count': int,              'like_count': int,              'comment_count': int,              'repost_count': int, @@ -54,6 +56,7 @@ class VineIE(InfoExtractor):              'upload_date': '20130430',              'uploader': 'Z3k3',              'uploader_id': '936470460173008896', +            'view_count': int,              'like_count': int,              'comment_count': int,              'repost_count': int, @@ -71,6 +74,7 @@ class VineIE(InfoExtractor):              'upload_date': '20150705',              'uploader': 'Pimry_zaa',              'uploader_id': '1135760698325307392', +            'view_count': int,              'like_count': int,              'comment_count': int,              'repost_count': int, @@ -86,10 +90,12 @@ class VineIE(InfoExtractor):          data = self._parse_json(              self._search_regex( -                r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*</script>' % video_id, +                r'window\.POST_DATA\s*=\s*({.+?});\s*</script>',                  webpage, 'vine data'),              video_id) +        data = data[list(data.keys())[0]] +          formats = [{              'format_id': '%(format)s-%(rate)s' % f,              'vcodec': f.get('format'), @@ -109,6 +115,7 @@ class VineIE(InfoExtractor):              'upload_date': unified_strdate(data.get('created')),              'uploader': username,              'uploader_id': data.get('userIdStr'), +            'view_count': int_or_none(data.get('loops', {}).get('count')),              'like_count': int_or_none(data.get('likes', {}).get('count')),              'comment_count': int_or_none(data.get('comments', {}).get('count')),              'repost_count': int_or_none(data.get('reposts', {}).get('count')), diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 79c819bc3..758d9c86b 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -3,6 
+3,7 @@ from __future__ import unicode_literals  import re  import json +import sys  from .common import InfoExtractor  from ..compat import compat_str @@ -10,7 +11,6 @@ from ..utils import (      ExtractorError,      int_or_none,      orderedSet, -    sanitized_Request,      str_to_int,      unescapeHTML,      unified_strdate, @@ -27,12 +27,12 @@ class VKIE(InfoExtractor):                      https?://                          (?:                              (?: -                                (?:m\.)?vk\.com/video_| +                                (?:(?:m|new)\.)?vk\.com/video_|                                  (?:www\.)?daxab.com/                              )                              ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|                              (?: -                                (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| +                                (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video|                                  (?:www\.)?daxab.com/embed/                              )                              (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? 
@@ -182,6 +182,10 @@ class VKIE(InfoExtractor):              # pladform embed              'url': 'https://vk.com/video-76116461_171554880',              'only_matching': True, +        }, +        { +            'url': 'http://new.vk.com/video205387401_165548505', +            'only_matching': True,          }      ] @@ -190,7 +194,7 @@ class VKIE(InfoExtractor):          if username is None:              return -        login_page = self._download_webpage( +        login_page, url_handle = self._download_webpage_handle(              'https://vk.com', None, 'Downloading login page')          login_form = self._hidden_inputs(login_page) @@ -200,11 +204,26 @@ class VKIE(InfoExtractor):              'pass': password.encode('cp1251'),          }) -        request = sanitized_Request( -            'https://login.vk.com/?act=login', -            urlencode_postdata(login_form)) +        # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header +        # and expects the first one to be set rather than second (see +        # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201). +        # As of RFC6265 the newer one cookie should be set into cookie store +        # what actually happens. +        # We will workaround this VK issue by resetting the remixlhk cookie to +        # the first one manually. 
+        cookies = url_handle.headers.get('Set-Cookie') +        if sys.version_info[0] >= 3: +            cookies = cookies.encode('iso-8859-1') +        cookies = cookies.decode('utf-8') +        remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies) +        if remixlhk: +            value, domain = remixlhk.groups() +            self._set_cookie(domain, 'remixlhk', value) +          login_page = self._download_webpage( -            request, None, note='Logging in as %s' % username) +            'https://login.vk.com/?act=login', None, +            note='Logging in as %s' % username, +            data=urlencode_postdata(login_form))          if re.search(r'onLoginFailed', login_page):              raise ExtractorError( @@ -339,7 +358,7 @@ class VKIE(InfoExtractor):  class VKUserVideosIE(InfoExtractor):      IE_NAME = 'vk:uservideos'      IE_DESC = "VK - User's Videos" -    _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' +    _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'      _TEMPLATE_URL = 'https://vk.com/videos'      _TESTS = [{          'url': 'http://vk.com/videos205387401', @@ -354,6 +373,12 @@ class VKUserVideosIE(InfoExtractor):      }, {          'url': 'http://vk.com/videos-97664626?section=all',          'only_matching': True, +    }, { +        'url': 'http://m.vk.com/videos205387401', +        'only_matching': True, +    }, { +        'url': 'http://new.vk.com/videos205387401', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 8e35f24e8..bec7ab327 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -25,7 +25,8 @@ class VRTIE(InfoExtractor):                  'timestamp': 1414271750.949,                  'upload_date': '20141025',                  'duration': 929, -            } +            }, +            'skip': 
'HTTP Error 404: Not Found',          },          # sporza.be          { @@ -39,7 +40,8 @@ class VRTIE(InfoExtractor):                  'timestamp': 1413835980.560,                  'upload_date': '20141020',                  'duration': 3238, -            } +            }, +            'skip': 'HTTP Error 404: Not Found',          },          # cobra.be          { @@ -53,16 +55,39 @@ class VRTIE(InfoExtractor):                  'timestamp': 1413967500.494,                  'upload_date': '20141022',                  'duration': 661, -            } +            }, +            'skip': 'HTTP Error 404: Not Found',          },          {              # YouTube video              'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957', -            'only_matching': True, +            'md5': 'b8b93da1df1cea6c8556255a796b7d61', +            'info_dict': { +                'id': 'Wji-BZ0oCwg', +                'ext': 'mp4', +                'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer', +                'description': 'md5:8e468944dce15567a786a67f74262583', +                'uploader': 'Star Wars', +                'uploader_id': 'starwars', +                'upload_date': '20160407', +            }, +            'add_ie': ['Youtube'],          },          {              'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', -            'only_matching': True, +            'md5': '', +            'info_dict': { +                'id': '2377055', +                'ext': 'mp4', +                'title': 'Cafe Derby', +                'description': 'Lenny Van Wesemael debuteert met de langspeelfilm Café Derby. 
Een waar gebeurd maar ook verzonnen verhaal.', +                'upload_date': '20150626', +                'timestamp': 1435305240.769, +            }, +            'params': { +                # m3u8 download +                'skip_download': True, +            }          }      ] @@ -98,6 +123,32 @@ class VRTIE(InfoExtractor):                  formats.extend(self._extract_m3u8_formats(                      src, video_id, 'mp4', entry_protocol='m3u8_native',                      m3u8_id='hls', fatal=False)) +                formats.extend(self._extract_f4m_formats( +                    src.replace('playlist.m3u8', 'manifest.f4m'), +                    video_id, f4m_id='hds', fatal=False)) +                if 'data-video-geoblocking="true"' not in webpage: +                    rtmp_formats = self._extract_smil_formats( +                        src.replace('playlist.m3u8', 'jwplayer.smil'), +                        video_id, fatal=False) +                    formats.extend(rtmp_formats) +                    for rtmp_format in rtmp_formats: +                        rtmp_format_c = rtmp_format.copy() +                        rtmp_format_c['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) +                        del rtmp_format_c['play_path'] +                        del rtmp_format_c['ext'] +                        http_format = rtmp_format_c.copy() +                        http_format.update({ +                            'url': rtmp_format_c['url'].replace('rtmp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''), +                            'format_id': rtmp_format['format_id'].replace('rtmp', 'http'), +                            'protocol': 'http', +                        }) +                        rtsp_format = rtmp_format_c.copy() +                        rtsp_format.update({ +                            'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), +                            
'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), +                            'protocol': 'rtsp', +                        }) +                        formats.extend([http_format, rtsp_format])              else:                  formats.extend(self._extract_f4m_formats(                      '%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False)) diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index 5a41f8ffa..bcb140305 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -6,17 +6,23 @@ from ..compat import compat_urllib_parse_unquote  class XNXXIE(InfoExtractor): -    _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)' -    _TEST = { -        'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', -        'md5': '0831677e2b4761795f68d417e0b7b445', +    _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/' +    _TESTS = [{ +        'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', +        'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0',          'info_dict': { -            'id': '1135332', +            'id': '55awb78',              'ext': 'flv', -            'title': 'lida » Naked Funny Actress  (5)', +            'title': 'Skyrim Test Video',              'age_limit': 18, -        } -    } +        }, +    }, { +        'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', +        'only_matching': True, +    }, { +        'url': 'http://www.xnxx.com/video-55awb78/', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 4075b8a4f..83bc1fef2 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -4,17 +4,23 @@ import itertools  import re  from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote  from ..utils 
import (      int_or_none,      orderedSet, +    parse_duration,      sanitized_Request,      str_to_int,  )  class XTubeIE(InfoExtractor): -    _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)' +    _VALID_URL = r'''(?x) +                        (?: +                            xtube:| +                            https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-) +                        ) +                        (?P<id>[^/?&#]+) +                    '''      _TESTS = [{          # old URL schema @@ -27,6 +33,8 @@ class XTubeIE(InfoExtractor):              'description': 'contains:an ET kind of thing',              'uploader': 'greenshowers',              'duration': 450, +            'view_count': int, +            'comment_count': int,              'age_limit': 18,          }      }, { @@ -51,21 +59,30 @@ class XTubeIE(InfoExtractor):          req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1')          webpage = self._download_webpage(req, display_id) -        flashvars = self._parse_json( -            self._search_regex( -                r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'), -            video_id)['flashvars'] - -        title = flashvars.get('title') or self._search_regex( -            r'<h1>([^<]+)</h1>', webpage, 'title') -        video_url = compat_urllib_parse_unquote(flashvars['video_url']) -        duration = int_or_none(flashvars.get('video_duration')) - -        uploader = self._search_regex( -            r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', -            webpage, 'uploader', fatal=False) +        sources = self._parse_json(self._search_regex( +            r'sources\s*:\s*({.+?}),', webpage, 'sources'), video_id) + +        formats = [] +        for format_id, format_url in sources.items(): +            formats.append({ +                'url': format_url, +                'format_id': 
format_id, +                'height': int_or_none(format_id), +            }) +        self._sort_formats(formats) + +        title = self._search_regex( +            (r'<h1>(?P<title>[^<]+)</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), +            webpage, 'title', group='title')          description = self._search_regex(              r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) +        uploader = self._search_regex( +            (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', +             r'<span[^>]+class="nickname"[^>]*>([^<]+)'), +            webpage, 'uploader', fatal=False) +        duration = parse_duration(self._search_regex( +            r'<dt>Runtime:</dt>\s*<dd>([^<]+)</dd>', +            webpage, 'duration', fatal=False))          view_count = str_to_int(self._search_regex(              r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>',              webpage, 'view count', fatal=False)) @@ -76,7 +93,6 @@ class XTubeIE(InfoExtractor):          return {              'id': video_id,              'display_id': display_id, -            'url': video_url,              'title': title,              'description': description,              'uploader': uploader, @@ -84,6 +100,7 @@ class XTubeIE(InfoExtractor):              'view_count': view_count,              'comment_count': comment_count,              'age_limit': 18, +            'formats': formats,          } diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 0be8932ad..a66daee46 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -68,6 +68,20 @@ class XuiteIE(InfoExtractor):          },          'skip': 'Video removed',      }, { +        # Video with encoded media id +        # from http://forgetfulbc.blogspot.com/2016/06/date.html +        'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0', +        'info_dict': { +            'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==', +            'ext': 'mp4', +     
       'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)', +            'description': 'md5:f0abdcb69df300f522a5442ef3146f2a', +            'timestamp': 1466160960, +            'upload_date': '20160617', +            'uploader': 'B.C. & Lowy', +            'uploader_id': '232279340', +        }, +    }, {          'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9',          'only_matching': True,      }] @@ -80,10 +94,9 @@ class XuiteIE(InfoExtractor):      def base64_encode_utf8(data):          return base64.b64encode(data.encode('utf-8')).decode('utf-8') -    def _extract_flv_config(self, media_id): -        base64_media_id = self.base64_encode_utf8(media_id) +    def _extract_flv_config(self, encoded_media_id):          flv_config = self._download_xml( -            'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id, +            'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id,              'flv config')          prop_dict = {}          for prop in flv_config.findall('./property'): @@ -108,9 +121,14 @@ class XuiteIE(InfoExtractor):                  '%s returned error: %s' % (self.IE_NAME, error_msg),                  expected=True) -        video_id = self._html_search_regex( -            r'data-mediaid="(\d+)"', webpage, 'media id') -        flv_config = self._extract_flv_config(video_id) +        encoded_media_id = self._search_regex( +            r'attributes\.name\s*=\s*"([^"]+)"', webpage, +            'encoded media id', default=None) +        if encoded_media_id is None: +            video_id = self._html_search_regex( +                r'data-mediaid="(\d+)"', webpage, 'media id') +            encoded_media_id = self.base64_encode_utf8(video_id) +        flv_config = self._extract_flv_config(encoded_media_id)          FORMATS = {              'audio': 'mp3', diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 927a964a4..b0679dfb7 
100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -19,6 +19,7 @@ from ..utils import (      mimetype2ext,  ) +from .brightcove import BrightcoveNewIE  from .nbc import NBCSportsVPlayerIE @@ -227,7 +228,12 @@ class YahooIE(InfoExtractor):          # Look for NBCSports iframes          nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)          if nbc_sports_url: -            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') +            return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key()) + +        # Look for Brightcove New Studio embeds +        bc_url = BrightcoveNewIE._extract_url(webpage) +        if bc_url: +            return self.url_result(bc_url, BrightcoveNewIE.ie_key())          # Query result is often embedded in webpage as JSON. Sometimes explicit requests          # to video API results in a failure with geo restriction reason therefore using diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 147608ebe..e37f237c7 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -16,7 +16,6 @@ from ..compat import (  from ..utils import (      ExtractorError,      get_element_by_attribute, -    sanitized_Request,  ) @@ -218,14 +217,10 @@ class YoukuIE(InfoExtractor):              headers = {                  'Referer': req_url,              } +            headers.update(self.geo_verification_headers())              self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') -            req = sanitized_Request(req_url, headers=headers) -            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') -            if cn_verification_proxy: -                req.add_header('Ytdl-request-proxy', cn_verification_proxy) - -            raw_data = self._download_json(req, video_id, note=note) +            raw_data = self._download_json(req_url, video_id, note=note, headers=headers)              return raw_data['data'] diff 
--git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 00dd602ff..8aa7dfc41 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -501,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'youtube_include_dash_manifest': True,                  'format': '141',              }, +            'skip': 'format 141 not served anymore',          },          # DASH manifest with encrypted signature          { @@ -517,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              },              'params': {                  'youtube_include_dash_manifest': True, -                'format': '141', +                'format': '141/bestaudio[ext=m4a]',              },          },          # JS player signature function name containing $ @@ -537,7 +538,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              },              'params': {                  'youtube_include_dash_manifest': True, -                'format': '141', +                'format': '141/bestaudio[ext=m4a]',              },          },          # Controversy video @@ -618,7 +619,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',                  'license': 'Standard YouTube License',                  'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', -                'uploader': 'Olympics', +                'uploader': 'Olympic',                  'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',              },              'params': { @@ -671,7 +672,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',                  'uploader': 'dorappi2000',                  'license': 'Standard YouTube License', -                'formats': 'mincount:33', +                'formats': 'mincount:32',              },          
},          # DASH manifest with segment_list @@ -691,7 +692,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'params': {                  'youtube_include_dash_manifest': True,                  'format': '135',  # bestvideo -            } +            }, +            'skip': 'This live event has ended.',          },          {              # Multifeed videos (multiple cameras), URL is for Main Camera @@ -762,6 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',              },              'playlist_count': 2, +            'skip': 'Not multifeed anymore',          },          {              'url': 'http://vid.plus/FlRa-iH7PGw', @@ -814,6 +817,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'params': {                  'skip_download': True,              }, +            'skip': 'This video does not exist.',          },          {              # Video licensed under Creative Commons @@ -1331,7 +1335,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      (?:[a-zA-Z-]+="[^"]*"\s+)*?                      (?:title|href)="([^"]+)"\s+                      (?:[a-zA-Z-]+="[^"]*"\s+)*? 
-                    class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> +                    class="[^"]*"[^>]*>                  [^<]+\.{3}\s*                  </a>              ''', r'\1', video_description) @@ -1726,6 +1730,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          } +class YoutubeSharedVideoIE(InfoExtractor): +    _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?ci=(?P<id>[0-9A-Za-z_-]{11})' +    IE_NAME = 'youtube:shared' + +    _TEST = { +        'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', +        'info_dict': { +            'id': 'uPDB5I9wfp8', +            'ext': 'webm', +            'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', +            'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', +            'upload_date': '20160219', +            'uploader': 'Pocoyo - Português (BR)', +            'uploader_id': 'PocoyoBrazil', +        }, +        'add_ie': ['Youtube'], +        'params': { +            # There are already too many Youtube downloads +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        real_video_id = self._html_search_meta( +            'videoId', webpage, 'YouTube video id', fatal=True) + +        return self.url_result(real_video_id, YoutubeIE.ie_key()) + +  class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):      IE_DESC = 'YouTube.com playlists'      _VALID_URL = r"""(?x)(?: @@ -1941,10 +1978,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):          return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)                  else super(YoutubeChannelIE, cls).suitable(url)) +    def _build_template_url(self, url, channel_id): +        return self._TEMPLATE_URL % channel_id +      def _real_extract(self, url):          channel_id = self._match_id(url) -    
    url = self._TEMPLATE_URL % channel_id +        url = self._build_template_url(url, channel_id)          # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)          # Workaround by extracting as a playlist if managed to obtain channel playlist URL @@ -1958,9 +1998,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):              channel_playlist_id = self._html_search_meta(                  'channelId', channel_page, 'channel id', default=None)              if not channel_playlist_id: -                channel_playlist_id = self._search_regex( -                    r'data-(?:channel-external-|yt)id="([^"]+)"', -                    channel_page, 'channel id', default=None) +                channel_url = self._html_search_meta( +                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), +                    channel_page, 'channel url', default=None) +                if channel_url: +                    channel_playlist_id = self._search_regex( +                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)', +                        channel_url, 'channel id', default=None)          if channel_playlist_id and channel_playlist_id.startswith('UC'):              playlist_id = 'UU' + channel_playlist_id[2:]              return self.url_result( @@ -1983,20 +2027,39 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):                  for video_id, video_title in self.extract_videos_from_page(channel_page)]              return self.playlist_result(entries, channel_id) +        try: +            next(self._entries(channel_page, channel_id)) +        except StopIteration: +            alert_message = self._html_search_regex( +                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', +                channel_page, 'alert', default=None, group='alert') +            if alert_message: +                raise ExtractorError('Youtube said: %s' 
% alert_message, expected=True) +          return self.playlist_result(self._entries(channel_page, channel_id), channel_id)  class YoutubeUserIE(YoutubeChannelIE):      IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' -    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/|c/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' -    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos' +    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' +    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'      IE_NAME = 'youtube:user'      _TESTS = [{          'url': 'https://www.youtube.com/user/TheLinuxFoundation',          'playlist_mincount': 320,          'info_dict': { -            'title': 'TheLinuxFoundation', +            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', +            'title': 'Uploads from The Linux Foundation', +        } +    }, { +        # Only available via https://www.youtube.com/c/12minuteathlete/videos +        # but not https://www.youtube.com/user/12minuteathlete/videos +        'url': 'https://www.youtube.com/c/12minuteathlete/videos', +        'playlist_mincount': 249, +        'info_dict': { +            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', +            'title': 'Uploads from 12 Minute Athlete',          }      }, {          'url': 'ytuser:phihag', @@ -2004,6 +2067,13 @@ class YoutubeUserIE(YoutubeChannelIE):      }, {          'url': 'https://www.youtube.com/c/gametrailers',          'only_matching': True, +    }, { +        'url': 'https://www.youtube.com/gametrailers', +        'only_matching': True, +    }, { +        # This channel is not available. 
+        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', +        'only_matching': True,      }]      @classmethod @@ -2016,6 +2086,10 @@ class YoutubeUserIE(YoutubeChannelIE):          else:              return super(YoutubeUserIE, cls).suitable(url) +    def _build_template_url(self, url, channel_id): +        mobj = re.match(self._VALID_URL, url) +        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) +  class YoutubeLiveIE(YoutubeBaseInfoExtractor):      IE_DESC = 'YouTube.com live streams' diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a7440c582..9737f7002 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -232,7 +232,7 @@ class JSInterpreter(object):      def extract_function(self, funcname):          func_m = re.search(              r'''(?x) -                (?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s* +                (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*                  \((?P<args>[^)]*)\)\s*                  \{(?P<code>[^}]+)\}''' % (                  re.escape(funcname), re.escape(funcname), re.escape(funcname)), diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 99ce4131f..c4a85b2c0 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -26,9 +26,11 @@ def parseOpts(overrideArguments=None):          except IOError:              return default  # silently skip if file is not present          try: -            res = [] -            for l in optionf: -                res += compat_shlex_split(l, comments=True) +            # FIXME: https://github.com/rg3/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 +            contents = optionf.read() +            if sys.version_info < (3,): +                contents = contents.decode(preferredencoding()) +            res = compat_shlex_split(contents, comments=True)          finally:              optionf.close()          return res @@ -212,10 
+214,15 @@ def parseOpts(overrideArguments=None):          help='Make all connections via IPv6 (experimental)',      )      network.add_option( +        '--geo-verification-proxy', +        dest='geo_verification_proxy', default=None, metavar='URL', +        help='Use this proxy to verify the IP address for some geo-restricted sites. ' +        'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)' +    ) +    network.add_option(          '--cn-verification-proxy',          dest='cn_verification_proxy', default=None, metavar='URL', -        help='Use this proxy to verify the IP address for some Chinese sites. ' -        'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)' +        help=optparse.SUPPRESS_HELP,      )      selection = optparse.OptionGroup(parser, 'Video Selection') @@ -809,11 +816,11 @@ def parseOpts(overrideArguments=None):              system_conf = []              user_conf = []          else: -            system_conf = compat_conf(_readOptions('/etc/youtube-dl.conf')) +            system_conf = _readOptions('/etc/youtube-dl.conf')              if '--ignore-config' in system_conf:                  user_conf = []              else: -                user_conf = compat_conf(_readUserConf()) +                user_conf = _readUserConf()          argv = system_conf + user_conf + command_line_conf          opts, args = parser.parse_args(argv) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index fa99b0c2a..c1e9eb159 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -363,8 +363,10 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):          input_files = [filename] + sub_filenames          opts = [ -            '-map', '0', -            '-c', 'copy', +            '-map', '0:v', +            '-c:v', 'copy', +            
'-map', '0:a', +            '-c:a', 'copy',              # Don't copy the existing subtitles, we may be running the              # postprocessor a second time              '-map', '-0:s', diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index fd49d7435..104807242 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -76,7 +76,7 @@ class Socks4Error(ProxyError):      CODES = {          91: 'request rejected or failed', -        92: 'request rejected becasue SOCKS server cannot connect to identd on the client', +        92: 'request rejected because SOCKS server cannot connect to identd on the client',          93: 'request rejected because the client program and identd report different user-ids'      } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index fe175e82c..4c1d0d526 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -110,6 +110,49 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙ                          itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],                                          'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy'))) +DATE_FORMATS = ( +    '%d %B %Y', +    '%d %b %Y', +    '%B %d %Y', +    '%b %d %Y', +    '%b %dst %Y %I:%M', +    '%b %dnd %Y %I:%M', +    '%b %dth %Y %I:%M', +    '%Y %m %d', +    '%Y-%m-%d', +    '%Y/%m/%d', +    '%Y/%m/%d %H:%M:%S', +    '%Y-%m-%d %H:%M:%S', +    '%Y-%m-%d %H:%M:%S.%f', +    '%d.%m.%Y %H:%M', +    '%d.%m.%Y %H.%M', +    '%Y-%m-%dT%H:%M:%SZ', +    '%Y-%m-%dT%H:%M:%S.%fZ', +    '%Y-%m-%dT%H:%M:%S.%f0Z', +    '%Y-%m-%dT%H:%M:%S', +    '%Y-%m-%dT%H:%M:%S.%f', +    '%Y-%m-%dT%H:%M', +) + +DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) +DATE_FORMATS_DAY_FIRST.extend([ +    '%d-%m-%Y', +    '%d.%m.%Y', +    '%d.%m.%y', +    '%d/%m/%Y', +    '%d/%m/%y', +    '%d/%m/%Y %H:%M:%S', +]) + +DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) +DATE_FORMATS_MONTH_FIRST.extend([ +    '%m-%d-%Y', +    '%m.%d.%Y', +    '%m/%d/%Y', +    
'%m/%d/%y', +    '%m/%d/%Y %H:%M:%S', +]) +  def preferredencoding():      """Get preferred encoding. @@ -267,9 +310,17 @@ def get_element_by_id(id, html):      return get_element_by_attribute('id', id, html) -def get_element_by_attribute(attribute, value, html): +def get_element_by_class(class_name, html): +    return get_element_by_attribute( +        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), +        html, escape_value=False) + + +def get_element_by_attribute(attribute, value, html, escape_value=True):      """Return the content of the tag with the specified attribute in the passed HTML document""" +    value = re.escape(value) if escape_value else value +      m = re.search(r'''(?xs)          <([a-zA-Z0-9:._-]+)           (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? @@ -278,7 +329,7 @@ def get_element_by_attribute(attribute, value, html):          \s*>          (?P<content>.*?)          </\1> -    ''' % (re.escape(attribute), re.escape(value)), html) +    ''' % (re.escape(attribute), value), html)      if not m:          return None @@ -975,6 +1026,24 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):      https_response = http_response +def extract_timezone(date_str): +    m = re.search( +        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', +        date_str) +    if not m: +        timezone = datetime.timedelta() +    else: +        date_str = date_str[:-len(m.group('tz'))] +        if not m.group('sign'): +            timezone = datetime.timedelta() +        else: +            sign = 1 if m.group('sign') == '+' else -1 +            timezone = datetime.timedelta( +                hours=sign * int(m.group('hours')), +                minutes=sign * int(m.group('minutes'))) +    return timezone, date_str + +  def parse_iso8601(date_str, delimiter='T', timezone=None):      """ Return a UNIX timestamp from the given date """ @@ -984,20 +1053,8 @@ def 
parse_iso8601(date_str, delimiter='T', timezone=None):      date_str = re.sub(r'\.[0-9]+', '', date_str)      if timezone is None: -        m = re.search( -            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', -            date_str) -        if not m: -            timezone = datetime.timedelta() -        else: -            date_str = date_str[:-len(m.group(0))] -            if not m.group('sign'): -                timezone = datetime.timedelta() -            else: -                sign = 1 if m.group('sign') == '+' else -1 -                timezone = datetime.timedelta( -                    hours=sign * int(m.group('hours')), -                    minutes=sign * int(m.group('minutes'))) +        timezone, date_str = extract_timezone(date_str) +      try:          date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)          dt = datetime.datetime.strptime(date_str, date_format) - timezone @@ -1006,6 +1063,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):          pass +def date_formats(day_first=True): +    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST + +  def unified_strdate(date_str, day_first=True):      """Return a string with the date in the format YYYYMMDD""" @@ -1014,53 +1075,11 @@ def unified_strdate(date_str, day_first=True):      upload_date = None      # Replace commas      date_str = date_str.replace(',', ' ') -    # %z (UTC offset) is only supported in python>=3.2 -    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str): -        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)      # Remove AM/PM + timezone      date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) +    _, date_str = extract_timezone(date_str) -    format_expressions = [ -        '%d %B %Y', -        '%d %b %Y', -        '%B %d %Y', -        '%b %d %Y', -        '%b %dst %Y %I:%M', -        '%b %dnd %Y %I:%M', -        '%b %dth %Y %I:%M', -        '%Y %m %d', -        
'%Y-%m-%d', -        '%Y/%m/%d', -        '%Y/%m/%d %H:%M:%S', -        '%Y-%m-%d %H:%M:%S', -        '%Y-%m-%d %H:%M:%S.%f', -        '%d.%m.%Y %H:%M', -        '%d.%m.%Y %H.%M', -        '%Y-%m-%dT%H:%M:%SZ', -        '%Y-%m-%dT%H:%M:%S.%fZ', -        '%Y-%m-%dT%H:%M:%S.%f0Z', -        '%Y-%m-%dT%H:%M:%S', -        '%Y-%m-%dT%H:%M:%S.%f', -        '%Y-%m-%dT%H:%M', -    ] -    if day_first: -        format_expressions.extend([ -            '%d-%m-%Y', -            '%d.%m.%Y', -            '%d.%m.%y', -            '%d/%m/%Y', -            '%d/%m/%y', -            '%d/%m/%Y %H:%M:%S', -        ]) -    else: -        format_expressions.extend([ -            '%m-%d-%Y', -            '%m.%d.%Y', -            '%m/%d/%Y', -            '%m/%d/%y', -            '%m/%d/%Y %H:%M:%S', -        ]) -    for expression in format_expressions: +    for expression in date_formats(day_first):          try:              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')          except ValueError: @@ -1076,6 +1095,29 @@ def unified_strdate(date_str, day_first=True):          return compat_str(upload_date) +def unified_timestamp(date_str, day_first=True): +    if date_str is None: +        return None + +    date_str = date_str.replace(',', ' ') + +    pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0) +    timezone, date_str = extract_timezone(date_str) + +    # Remove AM/PM + timezone +    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + +    for expression in date_formats(day_first): +        try: +            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta +            return calendar.timegm(dt.timetuple()) +        except ValueError: +            pass +    timetuple = email.utils.parsedate_tz(date_str) +    if timetuple: +        return calendar.timegm(timetuple.timetuple()) + +  def determine_ext(url, default_ext='unknown_video'):      if url is None:          return 
default_ext @@ -1410,6 +1452,8 @@ def shell_quote(args):  def smuggle_url(url, data):      """ Pass additional data in a URL for internal use. """ +    url, idata = unsmuggle_url(url, {}) +    data.update(idata)      sdata = compat_urllib_parse_urlencode(          {'__youtubedl_smuggle': json.dumps(data)})      return url + '#' + sdata @@ -1591,6 +1635,11 @@ class HEADRequest(compat_urllib_request.Request):          return 'HEAD' +class PUTRequest(compat_urllib_request.Request): +    def get_method(self): +        return 'PUT' + +  def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):      if get_attr:          if v is not None: @@ -1626,6 +1675,10 @@ def float_or_none(v, scale=1, invscale=1, default=None):          return default +def strip_or_none(v): +    return None if v is None else v.strip() + +  def parse_duration(s):      if not isinstance(s, compat_basestring):          return None @@ -1882,7 +1935,13 @@ def update_Request(req, url=None, data=None, headers={}, query={}):      req_headers.update(headers)      req_data = data or req.data      req_url = update_url_query(url or req.get_full_url(), query) -    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request +    req_get_method = req.get_method() +    if req_get_method == 'HEAD': +        req_type = HEADRequest +    elif req_get_method == 'PUT': +        req_type = PUTRequest +    else: +        req_type = compat_urllib_request.Request      new_req = req_type(          req_url, data=req_data, headers=req_headers,          origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) @@ -2046,6 +2105,7 @@ def mimetype2ext(mt):          return ext      _, _, res = mt.rpartition('/') +    res = res.lower()      return {          '3gpp': '3gp', @@ -2057,6 +2117,12 @@ def mimetype2ext(mt):          'x-flv': 'flv',          'x-mp4-fragmented': 'mp4',          'x-ms-wmv': 'wmv', +        'mpegurl': 'm3u8', +        'x-mpegurl': 'm3u8', +        
'vnd.apple.mpegurl': 'm3u8', +        'dash+xml': 'mpd', +        'f4m': 'f4m', +        'f4m+xml': 'f4m',      }.get(res, res) @@ -2897,3 +2963,7 @@ def parse_m3u8_attributes(attrib):              val = val[1:-1]          info[key] = val      return info + + +def urshift(val, n): +    return val >> n if val >= 0 else (val + 0x100000000) >> n diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4a9f162c1..728ad2d50 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2016.06.20' +__version__ = '2016.07.09.2'  | 
