diff options
27 files changed, 568 insertions, 210 deletions
@@ -144,3 +144,5 @@ Lee Jenkins  Anssi Hannula  Lukáš Lalinský  Qijiang Fan +Rémy Léone +Marco Ferragina diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a9820c1f5..5016ba4bc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -200,7 +200,6 @@   - **GodTube**   - **GoldenMoustache**   - **Golem** - - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net and filehoot.com   - **Goshgay**   - **Groupon**   - **Hark** @@ -671,6 +670,7 @@   - **WSJ**: Wall Street Journal   - **XBef**   - **XboxClips** + - **XFileShare**: XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me   - **XHamster**   - **XHamsterEmbed**   - **XMinus** diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0a90da73c..59c82f65d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -60,7 +60,10 @@ from .bloomberg import BloombergIE  from .bpb import BpbIE  from .br import BRIE  from .breakcom import BreakIE -from .brightcove import BrightcoveIE +from .brightcove import ( +    BrightcoveLegacyIE, +    BrightcoveNewIE, +)  from .buzzfeed import BuzzFeedIE  from .byutv import BYUtvIE  from .c56 import C56IE @@ -221,7 +224,6 @@ from .goldenmoustache import GoldenMoustacheIE  from .golem import GolemIE  from .googleplus import GooglePlusIE  from .googlesearch import GoogleSearchIE -from .gorillavid import GorillaVidIE  from .goshgay import GoshgayIE  from .groupon import GrouponIE  from .hark import HarkIE @@ -418,7 +420,10 @@ from .nowness import (      NownessPlaylistIE,      NownessSeriesIE,  ) -from .nowtv import NowTVIE +from .nowtv import ( +    NowTVIE, +    NowTVListIE, +)  from .nowvideo import NowVideoIE  from .npo import (      NPOIE, @@ -456,10 +461,7 @@ from .orf import (  from .parliamentliveuk import ParliamentLiveUKIE  from .patreon import PatreonIE  from .pbs import PBSIE -from .periscope import ( -    PeriscopeIE, -    QuickscopeIE, -) +from .periscope import PeriscopeIE  from .philharmoniedeparis import PhilharmonieDeParisIE  from .phoenix import PhoenixIE  from .photobucket import PhotobucketIE @@ -786,6 +788,7 @@ from .wrzuta import WrzutaIE  from .wsj import WSJIE  from .xbef import XBefIE  from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE  from .xhamster import (      XHamsterIE,      XHamsterEmbedIE, diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 184a14a4f..5b2c0dc9a 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -15,7 +15,7 @@ class AlJazeeraIE(InfoExtractor):              'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',              'uploader': 'Al Jazeera English',          }, -        'add_ie': ['Brightcove'], +        'add_ie': ['BrightcoveLegacy'],          'skip': 'Not accessible from Travis CI server',      } @@ -32,5 +32,5 @@ class AlJazeeraIE(InfoExtractor):                  'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc'                  '&%40videoPlayer={0}'.format(brightcove_id)              ), -            'ie_key': 'Brightcove', +            'ie_key': 'BrightcoveLegacy',          } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index a55a6dbc9..33b296eaf 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -27,7 +27,7 @@ class BBCCoUkIE(InfoExtractor):      _MEDIASELECTOR_URLS = [          # Provides HQ HLS streams with even better quality that pc mediaset but fails          # with geolocation in some cases when it's even not geo restricted at all (e.g. -        # http://www.bbc.co.uk/programmes/b06bp7lf) +        # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',          'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',      ] @@ -334,7 +334,7 @@ class BBCCoUkIE(InfoExtractor):                  return self._download_media_selector_url(                      mediaselector_url % programme_id, programme_id)              except BBCCoUkIE.MediaSelectionError as e: -                if e.id in ('notukerror', 'geolocation'): +                if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):                      last_exception = e                      continue                  self._raise_extractor_error(e) @@ -345,7 +345,7 @@ class BBCCoUkIE(InfoExtractor):              media_selection = self._download_xml(                  url, programme_id, 'Downloading media selection XML')          except ExtractorError as ee: -            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: +            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):                  media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))              else:                  raise diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 1686cdde1..14ee05f21 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -20,12 +20,17 @@ from ..utils import (      ExtractorError,      find_xpath_attr,      fix_xml_ampersands, +    float_or_none, +    js_to_json, +    int_or_none, +    parse_iso8601,      unescapeHTML,      unsmuggle_url,  ) -class BrightcoveIE(InfoExtractor): +class BrightcoveLegacyIE(InfoExtractor): +    IE_NAME = 'brightcove:legacy'      _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'      _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' @@ -346,3 +351,172 @@ class BrightcoveIE(InfoExtractor):          if 'url' not in info and not info.get('formats'):              raise ExtractorError('Unable to extract video url for %s' % info['id'])          return info + + +class BrightcoveNewIE(InfoExtractor): +    IE_NAME = 'brightcove:new' +    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+)' +    _TESTS = [{ +        'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', +        'md5': 'c8100925723840d4b0d243f7025703be', +        'info_dict': { +            'id': '4463358922001', +            'ext': 'mp4', +            'title': 'Meet the man behind Popcorn Time', +            'description': 'md5:eac376a4fe366edc70279bfb681aea16', +            'duration': 165.768, +            'timestamp': 1441391203, +            'upload_date': '20150904', +            'uploader_id': '929656772001', +            'formats': 'mincount:22', +        }, +    }, { +        # with rtmp streams +        'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001', +        'info_dict': { +            'id': '4279049078001', +            'ext': 'mp4', +            'title': 'Titansgrave: Chapter 0', +            'description': 'Titansgrave: Chapter 0', +            'duration': 1242.058, +            'timestamp': 1433556729, +            'upload_date': '20150606', +            'uploader_id': '4036320279001', +            'formats': 'mincount:41', +        }, +        'params': { +            'skip_download': True, +        } +    }] + +    @staticmethod +    def _extract_urls(webpage): +        # Reference: +        # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe +        # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript) +        # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html + +        entries = [] + +        # Look for iframe embeds [1] +        for _, url in re.findall( +                r'<iframe[^>]+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): +            entries.append(url) + +        # Look for embed_in_page embeds [2] +        for video_id, account_id, player_id, embed in re.findall( +                # According to examples from [3] it's unclear whether video id +                # may be optional and what to do when it is +                r'''(?sx) +                    <video[^>]+ +                        data-video-id=["\'](\d+)["\'][^>]*>.*? +                    </video>.*? +                    <script[^>]+ +                        src=["\'](?:https?:)?//players\.brightcove\.net/ +                        (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js +                ''', webpage): +            entries.append( +                'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' +                % (account_id, player_id, embed, video_id)) + +        return entries + +    def _real_extract(self, url): +        account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() + +        webpage = self._download_webpage( +            'http://players.brightcove.net/%s/%s_%s/index.min.js' +            % (account_id, player_id, embed), video_id) + +        policy_key = None + +        catalog = self._search_regex( +            r'catalog\(({.+?})\);', webpage, 'catalog', default=None) +        if catalog: +            catalog = self._parse_json( +                js_to_json(catalog), video_id, fatal=False) +            if catalog: +                policy_key = catalog.get('policyKey') + +        if not policy_key: +            policy_key = self._search_regex( +                r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', +                webpage, 'policy key', group='pk') + +        req = compat_urllib_request.Request( +            'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' +            % (account_id, video_id), +            headers={'Accept': 'application/json;pk=%s' % policy_key}) +        json_data = self._download_json(req, video_id) + +        title = json_data['name'] + +        formats = [] +        for source in json_data.get('sources', []): +            source_type = source.get('type') +            src = source.get('src') +            if source_type == 'application/x-mpegURL': +                if not src: +                    continue +                m3u8_formats = self._extract_m3u8_formats( +                    src, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id='hls', fatal=False) +                if m3u8_formats: +                    formats.extend(m3u8_formats) +            else: +                streaming_src = source.get('streaming_src') +                stream_name, app_name = source.get('stream_name'), source.get('app_name') +                if not src and not streaming_src and (not stream_name or not app_name): +                    continue +                tbr = float_or_none(source.get('avg_bitrate'), 1000) +                height = int_or_none(source.get('height')) +                f = { +                    'tbr': tbr, +                    'width': int_or_none(source.get('width')), +                    'height': height, +                    'filesize': int_or_none(source.get('size')), +                    'container': source.get('container'), +                    'vcodec': source.get('codec'), +                    'ext': source.get('container').lower(), +                } + +                def build_format_id(kind): +                    format_id = kind +                    if tbr: +                        format_id += '-%dk' % int(tbr) +                    if height: +                        format_id += '-%dp' % height +                    return format_id + +                if src or streaming_src: +                    f.update({ +                        'url': src or streaming_src, +                        'format_id': build_format_id('http' if src else 'http-streaming'), +                        'preference': 2 if src else 1, +                    }) +                else: +                    f.update({ +                        'url': app_name, +                        'play_path': stream_name, +                        'format_id': build_format_id('rtmp'), +                    }) +                formats.append(f) +        self._sort_formats(formats) + +        description = json_data.get('description') +        thumbnail = json_data.get('thumbnail') +        timestamp = parse_iso8601(json_data.get('published_at')) +        duration = float_or_none(json_data.get('duration'), 1000) +        tags = json_data.get('tags', []) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'timestamp': timestamp, +            'uploader_id': account_id, +            'formats': formats, +            'tags': tags, +        } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 75fffb156..43f05d278 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,6 +1,8 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..compat import compat_urllib_request +from ..utils import smuggle_url  class CBSIE(InfoExtractor): @@ -46,13 +48,19 @@ class CBSIE(InfoExtractor):      def _real_extract(self, url):          display_id = self._match_id(url) -        webpage = self._download_webpage(url, display_id) +        request = compat_urllib_request.Request(url) +        # Android UA is served with higher quality (720p) streams (see +        # https://github.com/rg3/youtube-dl/issues/7490) +        request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.4; Nexus 5)') +        webpage = self._download_webpage(request, display_id)          real_id = self._search_regex(              [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"],              webpage, 'real video ID')          return {              '_type': 'url_transparent',              'ie_key': 'ThePlatform', -            'url': 'theplatform:%s' % real_id, +            'url': smuggle_url( +                'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true&manifest=m3u' % real_id, +                {'force_smil_url': True}),              'display_id': display_id,          } diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 52e61d85b..f9a64a0a2 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -67,9 +67,12 @@ class CBSNewsIE(InfoExtractor):                  'format_id': format_id,              }              if uri.startswith('rtmp'): +                play_path = re.sub( +                    r'{slistFilePath}', '', +                    uri.split('<break>')[-1].split('{break}')[-1])                  fmt.update({                      'app': 'ondemand?auth=cbs', -                    'play_path': 'mp4:' + uri.split('<break>')[-1], +                    'play_path': 'mp4:' + play_path,                      'player_url': 'http://www.cbsnews.com/[[IMPORT]]/vidtech.cbsinteractive.com/player/3_3_0/CBSI_PLAYER_HD.swf',                      'page_url': 'http://www.cbsnews.com',                      'ext': 'flv', diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py index 1f00386fe..f5a31058d 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/youtube_dl/extractor/dumpert.py @@ -2,6 +2,7 @@  from __future__ import unicode_literals  import base64 +import re  from .common import InfoExtractor  from ..compat import compat_urllib_request @@ -9,7 +10,7 @@ from ..utils import qualities  class DumpertIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/(?:mediabase|embed)/(?P<id>[0-9]+/[0-9a-zA-Z]+)' +    _VALID_URL = r'(?P<protocol>https?)://(?:www\.)?dumpert\.nl/(?:mediabase|embed)/(?P<id>[0-9]+/[0-9a-zA-Z]+)'      _TESTS = [{          'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/',          'md5': '1b9318d7d5054e7dcb9dc7654f21d643', @@ -26,9 +27,11 @@ class DumpertIE(InfoExtractor):      }]      def _real_extract(self, url): -        video_id = self._match_id(url) +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        protocol = mobj.group('protocol') -        url = 'https://www.dumpert.nl/mediabase/' + video_id +        url = '%s://www.dumpert.nl/mediabase/%s' % (protocol, video_id)          req = compat_urllib_request.Request(url)          req.add_header('Cookie', 'nsfw=1; cpc=10')          webpage = self._download_webpage(req, video_id) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index f5f13689c..7f21d7410 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -45,11 +45,20 @@ class FunnyOrDieIE(InfoExtractor):          links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0) -        bitrates = self._html_search_regex(r'<source src="[^"]+/v,((?:\d+,)+)\.mp4\.csmil', webpage, 'video bitrates') -        bitrates = [int(b) for b in bitrates.rstrip(',').split(',')] -        bitrates.sort() +        m3u8_url = self._search_regex( +            r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8)\1', +            webpage, 'm3u8 url', default=None, group='url')          formats = [] + +        m3u8_formats = self._extract_m3u8_formats( +            m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) +        if m3u8_formats: +            formats.extend(m3u8_formats) + +        bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)[,/]', m3u8_url)] +        bitrates.sort() +          for bitrate in bitrates:              for link in links:                  formats.append({ diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d0b486d2a..51516a38a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -30,7 +30,10 @@ from ..utils import (      url_basename,      xpath_text,  ) -from .brightcove import BrightcoveIE +from .brightcove import ( +    BrightcoveLegacyIE, +    BrightcoveNewIE, +)  from .nbc import NBCSportsVPlayerIE  from .ooyala import OoyalaIE  from .rutv import RUTVIE @@ -275,7 +278,7 @@ class GenericIE(InfoExtractor):          # it also tests brightcove videos that need to set the 'Referer' in the          # http requests          { -            'add_ie': ['Brightcove'], +            'add_ie': ['BrightcoveLegacy'],              'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',              'info_dict': {                  'id': '2765128793001', @@ -299,7 +302,7 @@ class GenericIE(InfoExtractor):                  'uploader': 'thestar.com',                  'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',              }, -            'add_ie': ['Brightcove'], +            'add_ie': ['BrightcoveLegacy'],          },          {              'url': 'http://www.championat.com/video/football/v/87/87499.html', @@ -314,7 +317,7 @@ class GenericIE(InfoExtractor):          },          {              # https://github.com/rg3/youtube-dl/issues/3541 -            'add_ie': ['Brightcove'], +            'add_ie': ['BrightcoveLegacy'],              'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',              'info_dict': {                  'id': '3866516442001', @@ -1031,6 +1034,17 @@ class GenericIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'cinemasnob',              }, +        }, +        # BrightcoveInPageEmbed embed +        { +            'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', +            'info_dict': { +                'id': '4238694884001', +                'ext': 'flv', +                'title': 'Tabletop: Dread, Last Thoughts', +                'description': 'Tabletop: Dread, Last Thoughts', +                'duration': 51690, +            },          }      ] @@ -1290,14 +1304,14 @@ class GenericIE(InfoExtractor):              return self.playlist_result(                  urlrs, playlist_id=video_id, playlist_title=video_title) -        # Look for BrightCove: -        bc_urls = BrightcoveIE._extract_brightcove_urls(webpage) +        # Look for Brightcove Legacy Studio embeds +        bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)          if bc_urls:              self.to_screen('Brightcove video detected.')              entries = [{                  '_type': 'url',                  'url': smuggle_url(bc_url, {'Referer': url}), -                'ie_key': 'Brightcove' +                'ie_key': 'BrightcoveLegacy'              } for bc_url in bc_urls]              return { @@ -1307,6 +1321,11 @@ class GenericIE(InfoExtractor):                  'entries': entries,              } +        # Look for Brightcove New Studio embeds +        bc_urls = BrightcoveNewIE._extract_urls(webpage) +        if bc_urls: +            return _playlist_from_matches(bc_urls, ie='BrightcoveNew') +          # Look for embedded rtl.nl player          matches = re.findall(              r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 3d78f78c4..fce179000 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -10,8 +10,8 @@ from ..utils import (  class InstagramIE(InfoExtractor): -    _VALID_URL = r'https://instagram\.com/p/(?P<id>[\da-zA-Z]+)' -    _TEST = { +    _VALID_URL = r'https://instagram\.com/p/(?P<id>[^/?#&]+)' +    _TESTS = [{          'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',          'md5': '0d2da106a9d2631273e192b372806516',          'info_dict': { @@ -21,7 +21,10 @@ class InstagramIE(InfoExtractor):              'title': 'Video by naomipq',              'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',          } -    } +    }, { +        'url': 'https://instagram.com/p/-Cmh1cukG2/', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 9a207b2cd..3d7e7e003 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -25,7 +25,7 @@ class LyndaBaseIE(InfoExtractor):          self._login()      def _login(self): -        (username, password) = self._get_login_info() +        username, password = self._get_login_info()          if username is None:              return @@ -83,6 +83,10 @@ class LyndaBaseIE(InfoExtractor):              raise ExtractorError('Unable to log in')      def _logout(self): +        username, _ = self._get_login_info() +        if username is None: +            return +          self._download_webpage(              'http://www.lynda.com/ajax/logout.aspx', None,              'Logging out', 'Unable to log out', fatal=False) diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 04d779890..6b15fc2e5 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -4,10 +4,14 @@ import re  from .common import InfoExtractor  from ..compat import ( +    compat_urllib_request,      compat_urlparse,  )  from ..utils import (      ExtractorError, +    NO_DEFAULT, +    encode_dict, +    urlencode_postdata,  ) @@ -38,19 +42,40 @@ class NovaMovIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url) -        page = self._download_webpage( -            'http://%s/video/%s' % (self._HOST, video_id), video_id, 'Downloading video page') +        url = 'http://%s/video/%s' % (self._HOST, video_id) -        if re.search(self._FILE_DELETED_REGEX, page) is not None: -            raise ExtractorError('Video %s does not exist' % video_id, expected=True) +        webpage = self._download_webpage( +            url, video_id, 'Downloading video page') -        filekey = self._search_regex(self._FILEKEY_REGEX, page, 'filekey') +        if re.search(self._FILE_DELETED_REGEX, webpage) is not None: +            raise ExtractorError('Video %s does not exist' % video_id, expected=True) -        title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False) -        description = self._html_search_regex(self._DESCRIPTION_REGEX, page, 'description', default='', fatal=False) +        def extract_filekey(default=NO_DEFAULT): +            return self._search_regex( +                self._FILEKEY_REGEX, webpage, 'filekey', default=default) + +        filekey = extract_filekey(default=None) + +        if not filekey: +            fields = self._hidden_inputs(webpage) +            post_url = self._search_regex( +                r'<form[^>]+action=(["\'])(?P<url>.+?)\1', webpage, +                'post url', default=url, group='url') +            if not post_url.startswith('http'): +                post_url = compat_urlparse.urljoin(url, post_url) +            request = compat_urllib_request.Request( +                post_url, urlencode_postdata(encode_dict(fields))) +            request.add_header('Content-Type', 'application/x-www-form-urlencoded') +            request.add_header('Referer', post_url) +            webpage = self._download_webpage( +                request, video_id, 'Downloading continue to the video page') + +        filekey = extract_filekey() + +        title = self._html_search_regex(self._TITLE_REGEX, webpage, 'title', fatal=False) +        description = self._html_search_regex(self._DESCRIPTION_REGEX, webpage, 'description', default='', fatal=False)          api_response = self._download_webpage(              'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id, diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index b97f62fdb..0fba55833 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,7 +1,7 @@  # encoding: utf-8  from __future__ import unicode_literals -from .brightcove import BrightcoveIE +from .brightcove import BrightcoveLegacyIE  from .common import InfoExtractor  from ..utils import ExtractorError  from ..compat import ( @@ -22,10 +22,10 @@ class NownessBaseIE(InfoExtractor):                              'http://www.nowness.com/iframe?id=%s' % video_id, video_id,                              note='Downloading player JavaScript',                              errnote='Unable to download player JavaScript') -                        bc_url = BrightcoveIE._extract_brightcove_url(player_code) +                        bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code)                          if bc_url is None:                              raise ExtractorError('Could not find player definition') -                        return self.url_result(bc_url, 'Brightcove') +                        return self.url_result(bc_url, 'BrightcoveLegacy')                      elif source == 'vimeo':                          return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')                      elif source == 'youtube': diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index b0bdffc4e..67e34b294 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -1,6 +1,8 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..compat import compat_str  from ..utils import ( @@ -13,8 +15,63 @@ from ..utils import (  ) -class NowTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' +class NowTVBaseIE(InfoExtractor): +    _VIDEO_FIELDS = ( +        'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', +        'broadcastStartDate', 'seoUrl', 'duration', 'files', +        'format.defaultImage169Format', 'format.defaultImage169Logo') + +    def _extract_video(self, info, display_id=None): +        video_id = compat_str(info['id']) + +        files = info['files'] +        if not files: +            if info.get('geoblocked', False): +                raise ExtractorError( +                    'Video %s is not available from your location due to geo restriction' % video_id, +                    expected=True) +            if not info.get('free', True): +                raise ExtractorError( +                    'Video %s is not available for free' % video_id, expected=True) + +        formats = [] +        for item in files['items']: +            if determine_ext(item['path']) != 'f4v': +                continue +            app, play_path = remove_start(item['path'], '/').split('/', 1) +            formats.append({ +                'url': 'rtmpe://fms.rtl.de', +                'app': app, +                'play_path': 'mp4:%s' % play_path, +                'ext': 'flv', +                'page_url': 'http://rtlnow.rtl.de', +                'player_url': 'http://cdn.static-fra.de/now/vodplayer.swf', +                'tbr': int_or_none(item.get('bitrate')), +            }) +        self._sort_formats(formats) + +        title = info['title'] +        description = info.get('articleLong') or info.get('articleShort') +        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') +        duration = parse_duration(info.get('duration')) + +        f = info.get('format', {}) +        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') + +        return { +            'id': video_id, +            'display_id': display_id or info.get('seoUrl'), +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'timestamp': timestamp, +            'duration': duration, +            'formats': formats, +        } + + +class NowTVIE(NowTVBaseIE): +    _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:list/[^/]+/)?(?P<id>[^/]+)/(?:player|preview)'      _TESTS = [{          # rtl @@ -23,7 +80,7 @@ class NowTVIE(InfoExtractor):              'id': '203519',              'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',              'ext': 'flv', -            'title': 'Die neuen Bauern und eine Hochzeit', +            'title': 'Inka Bause stellt die neuen Bauern vor',              'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',              'thumbnail': 're:^https?://.*\.jpg$',              'timestamp': 1432580700, @@ -136,58 +193,65 @@ class NowTVIE(InfoExtractor):      }]      def _real_extract(self, url): -        display_id = self._match_id(url) -        display_id_split = display_id.split('/') -        if len(display_id) > 2: -            display_id = '/'.join((display_id_split[0], display_id_split[-1])) +        mobj = re.match(self._VALID_URL, url) +        display_id = '%s/%s' % (mobj.group('show_id'), mobj.group('id'))          info = self._download_json( -            'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id, -            display_id) +            'https://api.nowtv.de/v3/movies/%s?fields=%s' +            % (display_id, ','.join(self._VIDEO_FIELDS)), display_id) -        video_id = compat_str(info['id']) +        return self._extract_video(info, display_id) -        files = info['files'] -        if not files: -            if info.get('geoblocked', False): -                raise ExtractorError( -                    'Video %s is not available from your location due to geo restriction' % video_id, -                    expected=True) -            if not info.get('free', True): -                raise ExtractorError( -                    'Video %s is not available for free' % video_id, expected=True) -        formats = [] -        for item in files['items']: -            if determine_ext(item['path']) != 'f4v': -                continue -            app, play_path = remove_start(item['path'], '/').split('/', 1) -            formats.append({ -                'url': 'rtmpe://fms.rtl.de', -                'app': app, -                'play_path': 'mp4:%s' % play_path, -                'ext': 'flv', -                'page_url': 'http://rtlnow.rtl.de', -                'player_url': 'http://cdn.static-fra.de/now/vodplayer.swf', -                'tbr': int_or_none(item.get('bitrate')), -            }) -        self._sort_formats(formats) +class NowTVListIE(NowTVBaseIE): +    _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/list/(?P<id>[^?/#&]+)$' -        title = info['title'] -        description = info.get('articleLong') or info.get('articleShort') -        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') -        duration = parse_duration(info.get('duration')) +    _SHOW_FIELDS = ('title', ) +    _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) -        f = info.get('format', {}) -        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') +    _TESTS = [{ +        'url': 'http://www.nowtv.at/rtl/stern-tv/list/aktuell', +        'info_dict': { +            'id': '17006', +            'title': 'stern TV - Aktuell', +        }, +        'playlist_count': 1, +    }, { +        'url': 'http://www.nowtv.at/rtl/das-supertalent/list/free-staffel-8', +        'info_dict': { +            'id': '20716', +            'title': 'Das Supertalent - FREE Staffel 8', +        }, +        'playlist_count': 14, +    }] -        return { -            'id': video_id, -            'display_id': display_id, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'timestamp': timestamp, -            'duration': duration, -            'formats': formats, -        } +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        show_id = mobj.group('show_id') +        season_id = mobj.group('id') + +        fields = [] +        fields.extend(self._SHOW_FIELDS) +        fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) +        fields.extend( +            'formatTabs.formatTabPages.container.movies.%s' % field +            for field in self._VIDEO_FIELDS) + +        list_info = self._download_json( +            'https://api.nowtv.de/v3/formats/seo?fields=%s&name=%s.php' +            % (','.join(fields), show_id), +            season_id) + +        season = next( +            season for season in list_info['formatTabs']['items'] +            if season.get('seoheadline') == season_id) + +        title = '%s - %s' % (list_info['title'], season['headline']) + +        entries = [] +        for container in season['formatTabPages']['items']: +            for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []: +                entries.append(self._extract_video(info)) + +        return self.playlist_result( +            entries, compat_str(season.get('id') or season_id), title) diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py index 17baa9679..57ee3d366 100644 --- a/youtube_dl/extractor/nowvideo.py +++ b/youtube_dl/extractor/nowvideo.py @@ -7,9 +7,9 @@ class NowVideoIE(NovaMovIE):      IE_NAME = 'nowvideo'      IE_DESC = 'NowVideo' -    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|ec|sx|eu|at|ag|co|li)'} +    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} -    _HOST = 'www.nowvideo.ch' +    _HOST = 'www.nowvideo.to'      _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'      _FILEKEY_REGEX = r'var fkzd="([^"]+)";' diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 887c8020d..63cc764bb 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -2,16 +2,12 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..compat import ( -    compat_urllib_parse, -    compat_urllib_request, -)  from ..utils import parse_iso8601  class PeriscopeIE(InfoExtractor):      IE_DESC = 'Periscope' -    _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' +    _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)'      # Alive example URLs can be found here http://onperiscope.com/      _TESTS = [{          'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', @@ -29,6 +25,9 @@ class PeriscopeIE(InfoExtractor):      }, {          'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv',          'only_matching': True, +    }, { +        'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX', +        'only_matching': True,      }]      def _call_api(self, method, value): @@ -81,24 +80,3 @@ class PeriscopeIE(InfoExtractor):              'thumbnails': thumbnails,              'formats': formats,          } - - -class QuickscopeIE(InfoExtractor): -    IE_DESC = 'Quick Scope' -    _VALID_URL = r'https?://watchonperiscope\.com/broadcast/(?P<id>\d+)' -    _TEST = { -        'url': 'https://watchonperiscope.com/broadcast/56180087', -        'only_matching': True, -    } - -    def _real_extract(self, url): -        broadcast_id = self._match_id(url) -        request = compat_urllib_request.Request( -            'https://watchonperiscope.com/api/accessChannel', compat_urllib_parse.urlencode({ -                'broadcast_id': broadcast_id, -                'entry_ticket': '', -                'from_push': 'false', -                'uses_sessions': 'true', -            }).encode('utf-8')) -        return self.url_result( -            self._download_json(request, broadcast_id)['share_url'], 'Periscope') diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index a16b73ff4..e417bf661 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -57,16 +57,21 @@ class RuutuIE(InfoExtractor):                      extract_formats(child)                  elif child.tag.endswith('File'):                      video_url = child.text -                    if not video_url or video_url in processed_urls or 'NOT_USED' in video_url: +                    if (not video_url or video_url in processed_urls or +                            any(p in video_url for p in ('NOT_USED', 'NOT-USED'))):                          return                      processed_urls.append(video_url)                      ext = determine_ext(video_url)                      if ext == 'm3u8': -                        formats.extend(self._extract_m3u8_formats( -                            video_url, video_id, 'mp4', m3u8_id='hls')) +                        m3u8_formats = self._extract_m3u8_formats( +                            video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) +                        if m3u8_formats: +                            formats.extend(m3u8_formats)                      elif ext == 'f4m': -                        formats.extend(self._extract_f4m_formats( -                            video_url, video_id, f4m_id='hds')) +                        f4m_formats = self._extract_f4m_formats( +                            video_url, video_id, f4m_id='hds', fatal=False) +                        if f4m_formats: +                            formats.extend(f4m_formats)                      else:                          proto = compat_urllib_parse_urlparse(video_url).scheme                          if not child.tag.startswith('HTTP') and proto != 'rtmp': diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index a602af692..e9e33d0a3 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from .brightcove import BrightcoveIE +from .brightcove import BrightcoveLegacyIE  from ..compat import (      compat_urllib_parse, @@ -112,11 +112,11 @@ class SafariIE(SafariBaseIE):              '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part),              part) -        bc_url = BrightcoveIE._extract_brightcove_url(webpage) +        bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)          if not bc_url:              raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) -        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'Brightcove') +        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy')  class SafariCourseIE(SafariBaseIE): diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py index c2d0d36a6..ebb5d6ec0 100644 --- a/youtube_dl/extractor/space.py +++ b/youtube_dl/extractor/space.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from .brightcove import BrightcoveIE +from .brightcove import BrightcoveLegacyIE  from ..utils import RegexNotFoundError, ExtractorError  class SpaceIE(InfoExtractor):      _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'      _TEST = { -        'add_ie': ['Brightcove'], +        'add_ie': ['BrightcoveLegacy'],          'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',          'info_dict': {              'id': '2780937028001', @@ -31,8 +31,8 @@ class SpaceIE(InfoExtractor):              brightcove_url = self._og_search_video_url(webpage)          except RegexNotFoundError:              # Other videos works fine with the info from the object -            brightcove_url = BrightcoveIE._extract_brightcove_url(webpage) +            brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)          if brightcove_url is None:              raise ExtractorError(                  'The webpage does not contain a video', expected=True) -        return self.url_result(brightcove_url, BrightcoveIE.ie_key()) +        return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key()) diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index 13263614c..d6d038a8d 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from .brightcove import BrightcoveIE +from .brightcove import BrightcoveLegacyIE  from .discovery import DiscoveryIE  from ..compat import compat_urlparse @@ -66,6 +66,6 @@ class TlcDeIE(InfoExtractor):          return {              '_type': 'url', -            'url': BrightcoveIE._extract_brightcove_url(iframe), -            'ie': BrightcoveIE.ie_key(), +            'url': BrightcoveLegacyIE._extract_brightcove_url(iframe), +            'ie': BrightcoveLegacyIE.ie_key(),          } diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 9d3e46b94..055047340 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -9,6 +9,8 @@ from ..utils import (      float_or_none,      xpath_text,      remove_end, +    int_or_none, +    ExtractorError,  ) @@ -18,7 +20,7 @@ class TwitterCardIE(InfoExtractor):      _TESTS = [          {              'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', -            'md5': '7d2f6b4d2eb841a7ccc893d479bfceb4', +            'md5': '4fa26a35f9d1bf4b646590ba8e84be19',              'info_dict': {                  'id': '560070183650213889',                  'ext': 'mp4', @@ -50,6 +52,20 @@ class TwitterCardIE(InfoExtractor):                  'uploader': 'OMG! Ubuntu!',                  'uploader_id': 'omgubuntu',              }, +            'add_ie': ['Youtube'], +        }, +        { +            'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', +            'md5': 'ab2745d0b0ce53319a534fccaa986439', +            'info_dict': { +                'id': 'iBb2x00UVlv', +                'ext': 'mp4', +                'upload_date': '20151113', +                'uploader_id': '1189339351084113920', +                'uploader': '@ArsenalTerje', +                'title': 'Vine by @ArsenalTerje', +            }, +            'add_ie': ['Vine'],          }      ] @@ -69,11 +85,11 @@ class TwitterCardIE(InfoExtractor):              request.add_header('User-Agent', user_agent)              webpage = self._download_webpage(request, video_id) -            youtube_url = self._html_search_regex( -                r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', -                webpage, 'youtube iframe', default=None) -            if youtube_url: -                return self.url_result(youtube_url, 'Youtube') +            iframe_url = self._html_search_regex( +                r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', +                webpage, 'video iframe', default=None) +            if iframe_url: +                return self.url_result(iframe_url)              config = self._parse_json(self._html_search_regex(                  r'data-player-config="([^"]+)"', webpage, 'data player config'), @@ -120,9 +136,9 @@ class TwitterIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)'      _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' -    _TEST = { +    _TESTS = [{          'url': 'https://twitter.com/freethenipple/status/643211948184596480', -        'md5': '31cd83a116fc41f99ae3d909d4caf6a0', +        'md5': 'db6612ec5d03355953c3ca9250c97e5e',          'info_dict': {              'id': '643211948184596480',              'ext': 'mp4', @@ -133,7 +149,30 @@ class TwitterIE(InfoExtractor):              'uploader': 'FREE THE NIPPLE',              'uploader_id': 'freethenipple',          }, -    } +    }, { +        'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', +        'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', +        'info_dict': { +            'id': '657991469417025536', +            'ext': 'mp4', +            'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai', +            'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"', +            'thumbnail': 're:^https?://.*\.png', +            'uploader': 'Gifs', +            'uploader_id': 'giphz', +        }, +    }, { +        'url': 'https://twitter.com/starwars/status/665052190608723968', +        'md5': '39b7199856dee6cd4432e72c74bc69d4', +        'info_dict': { +            'id': '665052190608723968', +            'ext': 'mp4', +            'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', +            'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."', +            'uploader_id': 'starwars', +            'uploader': 'Star Wars', +        }, +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -144,23 +183,46 @@ class TwitterIE(InfoExtractor):          username = remove_end(self._og_search_title(webpage), ' on Twitter') -        title = self._og_search_description(webpage).strip('').replace('\n', ' ') +        title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”')          # strip  'https -_t.co_BJYgOjSeGA' junk from filenames -        mobj = re.match(r'“(.*)\s+(https?://[^ ]+)”', title) -        title, short_url = mobj.groups() - -        card_id = self._search_regex( -            r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url') -        card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id +        title = re.sub(r'\s+(https?://[^ ]+)', '', title) -        return { -            '_type': 'url_transparent', -            'ie_key': 'TwitterCard', +        info = {              'uploader_id': user_id,              'uploader': username, -            'url': card_url,              'webpage_url': url, -            'description': '%s on Twitter: "%s %s"' % (username, title, short_url), +            'description': '%s on Twitter: "%s"' % (username, description),              'title': username + ' - ' + title,          } + +        card_id = self._search_regex( +            r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None) +        if card_id: +            card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id +            info.update({ +                '_type': 'url_transparent', +                'ie_key': 'TwitterCard', +                'url': card_url, +            }) +            return info + +        mobj = re.search(r'''(?x) +            <video[^>]+class="animated-gif"[^>]+ +                (?:data-height="(?P<height>\d+)")?[^>]+ +                (?:data-width="(?P<width>\d+)")?[^>]+ +                (?:poster="(?P<poster>[^"]+)")?[^>]*>\s* +                <source[^>]+video-src="(?P<url>[^"]+)" +        ''', webpage) + +        if mobj: +            info.update({ +                'id': twid, +                'url': mobj.group('url'), +                'height': int_or_none(mobj.group('height')), +                'width': int_or_none(mobj.group('width')), +                'thumbnail': mobj.group('poster'), +            }) +            return info + +        raise ExtractorError('There\'s not video in this tweet.') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ca716c8f5..b72341a2b 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -49,8 +49,8 @@ class VimeoBaseInfoExtractor(InfoExtractor):          }))          login_request = compat_urllib_request.Request(self._LOGIN_URL, data)          login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') -        login_request.add_header('Cookie', 'vuid=%s' % vuid)          login_request.add_header('Referer', self._LOGIN_URL) +        self._set_vimeo_cookie('vuid', vuid)          self._download_webpage(login_request, None, False, 'Wrong login info')      def _extract_xsrft_and_vuid(self, webpage): @@ -62,6 +62,9 @@ class VimeoBaseInfoExtractor(InfoExtractor):              webpage, 'vuid', group='vuid')          return xsrft, vuid +    def _set_vimeo_cookie(self, name, value): +        self._set_cookie('vimeo.com', name, value) +  class VimeoIE(VimeoBaseInfoExtractor):      """Information extractor for vimeo.com.""" @@ -217,8 +220,8 @@ class VimeoIE(VimeoBaseInfoExtractor):              url = url.replace('http://', 'https://')          password_request = compat_urllib_request.Request(url + '/password', data)          password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') -        password_request.add_header('Cookie', 'clip_test2=1; vuid=%s' % vuid)          password_request.add_header('Referer', url) +        self._set_vimeo_cookie('vuid', vuid)          return self._download_webpage(              password_request, video_id,              'Verifying the password', 'Wrong password') @@ -384,47 +387,29 @@ class VimeoIE(VimeoBaseInfoExtractor):              like_count = None              comment_count = None -        # Vimeo specific: extract request signature and timestamp -        sig = config['request']['signature'] -        timestamp = config['request']['timestamp'] - -        # Vimeo specific: extract video codec and quality information -        # First consider quality, then codecs, then take everything -        codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')] -        files = {'hd': [], 'sd': [], 'other': []} -        config_files = config["video"].get("files") or config["request"].get("files") -        for codec_name, codec_extension in codecs: -            for quality in config_files.get(codec_name, []): -                format_id = '-'.join((codec_name, quality)).lower() -                key = quality if quality in files else 'other' -                video_url = None -                if isinstance(config_files[codec_name], dict): -                    file_info = config_files[codec_name][quality] -                    video_url = file_info.get('url') -                else: -                    file_info = {} -                if video_url is None: -                    video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ -                        % (video_id, sig, timestamp, quality, codec_name.upper()) - -                files[key].append({ -                    'ext': codec_extension, -                    'url': video_url, -                    'format_id': format_id, -                    'width': int_or_none(file_info.get('width')), -                    'height': int_or_none(file_info.get('height')), -                    'tbr': int_or_none(file_info.get('bitrate')), -                })          formats = [] -        m3u8_url = config_files.get('hls', {}).get('all') +        config_files = config['video'].get('files') or config['request'].get('files', {}) +        for f in config_files.get('progressive', []): +            video_url = f.get('url') +            if not video_url: +                continue +            formats.append({ +                'url': video_url, +                'format_id': 'http-%s' % f.get('quality'), +                'width': int_or_none(f.get('width')), +                'height': int_or_none(f.get('height')), +                'fps': int_or_none(f.get('fps')), +                'tbr': int_or_none(f.get('bitrate')), +            }) +        m3u8_url = config_files.get('hls', {}).get('url')          if m3u8_url:              m3u8_formats = self._extract_m3u8_formats(                  m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False)              if m3u8_formats:                  formats.extend(m3u8_formats) -        for key in ('other', 'sd', 'hd'): -            formats += files[key] -        self._sort_formats(formats) +        # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps +        # at the same time without actual units specified. This lead to wrong sorting. +        self._sort_formats(formats, field_preference=('height', 'width', 'fps', 'format_id'))          subtitles = {}          text_tracks = config['request'].get('text_tracks') @@ -494,8 +479,8 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):          password_url = compat_urlparse.urljoin(page_url, password_path)          password_request = compat_urllib_request.Request(password_url, post)          password_request.add_header('Content-type', 'application/x-www-form-urlencoded') -        password_request.add_header('Cookie', 'vuid=%s' % vuid) -        self._set_cookie('vimeo.com', 'xsrft', token) +        self._set_vimeo_cookie('vuid', vuid) +        self._set_vimeo_cookie('xsrft', token)          return self._download_webpage(              password_request, list_id, diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index 2ddf29a69..5a897371d 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -84,6 +84,5 @@ class WSJIE(InfoExtractor):              'duration': duration,              'upload_date': upload_date,              'title': title, -            'formats': formats,              'categories': categories,          } diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/xfileshare.py index d23e3eac1..952515c98 100644 --- a/youtube_dl/extractor/gorillavid.py +++ b/youtube_dl/extractor/xfileshare.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8  from __future__ import unicode_literals  import re @@ -15,11 +15,11 @@ from ..utils import (  ) -class GorillaVidIE(InfoExtractor): -    IE_DESC = 'GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net and filehoot.com' +class XFileShareIE(InfoExtractor): +    IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me'      _VALID_URL = r'''(?x)          https?://(?P<host>(?:www\.)? -            (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com))/ +            (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me))/          (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?      ''' @@ -76,6 +76,13 @@ class GorillaVidIE(InfoExtractor):              'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4',              'thumbnail': 're:http://.*\.jpg',          } +    }, { +        'url': 'http://vidto.me/ku5glz52nqe1.html', +        'info_dict': { +            'id': 'ku5glz52nqe1', +            'ext': 'mp4', +            'title': 'test' +        }      }]      def _real_extract(self, url): @@ -104,13 +111,18 @@ class GorillaVidIE(InfoExtractor):              webpage = self._download_webpage(req, video_id, 'Downloading video page') -        title = self._search_regex( -            [r'style="z-index: [0-9]+;">([^<]+)</span>', r'<td nowrap>([^<]+)</td>', r'>Watch (.+) '], -            webpage, 'title', default=None) or self._og_search_title(webpage) +        title = (self._search_regex( +            [r'style="z-index: [0-9]+;">([^<]+)</span>', +             r'<td nowrap>([^<]+)</td>', +             r'>Watch (.+) ', +             r'<h2 class="video-page-head">([^<]+)</h2>'], +            webpage, 'title', default=None) or self._og_search_title(webpage)).strip()          video_url = self._search_regex( -            r'file\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'file url') +            [r'file\s*:\s*["\'](http[^"\']+)["\'],', +             r'file_link\s*=\s*\'(https?:\/\/[0-9a-zA-z.\/\-_]+)'], +            webpage, 'file url')          thumbnail = self._search_regex( -            r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', fatal=False) +            r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None)          formats = [{              'format_id': 'sd', diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b3d254005..6585d60d5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.11.10' +__version__ = '2015.11.13'  | 
