diff options
Diffstat (limited to 'youtube_dl')
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 12 | ||||
| -rw-r--r-- | youtube_dl/extractor/audioboom.py | 66 | ||||
| -rw-r--r-- | youtube_dl/extractor/bbc.py | 30 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 49 | ||||
| -rw-r--r-- | youtube_dl/extractor/dw.py | 85 | ||||
| -rw-r--r-- | youtube_dl/extractor/facebook.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 39 | ||||
| -rw-r--r-- | youtube_dl/extractor/googledrive.py | 12 | ||||
| -rw-r--r-- | youtube_dl/extractor/kaltura.py | 67 | ||||
| -rw-r--r-- | youtube_dl/extractor/makerschannel.py | 40 | ||||
| -rw-r--r-- | youtube_dl/extractor/minoto.py | 56 | ||||
| -rw-r--r-- | youtube_dl/extractor/mixcloud.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/safari.py | 59 | ||||
| -rw-r--r-- | youtube_dl/extractor/vice.py | 78 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 38 | 
15 files changed, 527 insertions, 115 deletions
| diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 98de5ddff..c5b80f4aa 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,6 +54,7 @@ from .arte import (  from .atresplayer import AtresPlayerIE  from .atttechchannel import ATTTechChannelIE  from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE  from .audiomack import AudiomackIE, AudiomackAlbumIE  from .azubu import AzubuIE, AzubuLiveIE  from .baidu import BaiduVideoIE @@ -188,6 +189,10 @@ from .dumpert import DumpertIE  from .defense import DefenseGouvFrIE  from .discovery import DiscoveryIE  from .dropbox import DropboxIE +from .dw import ( +    DWIE, +    DWArticleIE, +)  from .eagleplatform import EaglePlatformIE  from .ebaumsworld import EbaumsWorldIE  from .echomsk import EchoMskIE @@ -384,6 +389,7 @@ from .lynda import (  from .m6 import M6IE  from .macgamestore import MacGameStoreIE  from .mailru import MailRuIE +from .makerschannel import MakersChannelIE  from .makertv import MakerTVIE  from .malemotion import MalemotionIE  from .matchtv import MatchTVIE @@ -393,6 +399,7 @@ from .metacritic import MetacriticIE  from .mgoon import MgoonIE  from .minhateca import MinhatecaIE  from .ministrygrid import MinistryGridIE +from .minoto import MinotoIE  from .miomio import MioMioIE  from .mit import TechTVMITIE, MITIE, OCWMITIE  from .mitele import MiTeleIE @@ -829,7 +836,10 @@ from .vgtv import (      VGTVIE,  )  from .vh1 import VH1IE -from .vice import ViceIE +from .vice import ( +    ViceIE, +    ViceShowIE, +)  from .viddler import ViddlerIE  from .videodetective import VideoDetectiveIE  from .videofyme import VideofyMeIE diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py new file mode 100644 index 000000000..2ec2d7092 --- /dev/null +++ b/youtube_dl/extractor/audioboom.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none + + +class AudioBoomIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)' +    _TEST = { +        'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0', +        'md5': '63a8d73a055c6ed0f1e51921a10a5a76', +        'info_dict': { +            'id': '4279833', +            'ext': 'mp3', +            'title': '3/09/2016 Czaban Hour 3', +            'description': 'Guest:   Nate Davis - NFL free agency,   Guest:   Stan Gans', +            'duration': 2245.72, +            'uploader': 'Steve Czaban', +            'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        clip = None + +        clip_store = self._parse_json( +            self._search_regex( +                r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id, +                webpage, 'clip store', default='{}', group='json'), +            video_id, fatal=False) +        if clip_store: +            clips = clip_store.get('clips') +            if clips and isinstance(clips, list) and isinstance(clips[0], dict): +                clip = clips[0] + +        def from_clip(field): +            if clip: +                clip.get(field) + +        audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( +            'audio', webpage, 'audio url') +        title = from_clip('title') or self._og_search_title(webpage) +        description = from_clip('description') or self._og_search_description(webpage) + +        duration = float_or_none(from_clip('duration') or self._html_search_meta( +            'weibo:audio:duration', webpage)) + +        uploader = from_clip('author') or self._og_search_property( +            'audio:artist', webpage, 'uploader', fatal=False) +        uploader_url = from_clip('author_url') or self._html_search_meta( +            'audioboo:channel', webpage, 'uploader url') + +        return { +            'id': video_id, +            'url': audio_url, +            'title': title, +            'description': description, +            'duration': duration, +            'uploader': uploader, +            'uploader_url': uploader_url, +        } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 9d0dfb961..e62b3860e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -10,7 +10,6 @@ from ..utils import (      int_or_none,      parse_duration,      parse_iso8601, -    remove_end,      unescapeHTML,  )  from ..compat import ( @@ -561,7 +560,7 @@ class BBCIE(BBCCoUkIE):          'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',          'info_dict': {              'id': '3662a707-0af9-3149-963f-47bea720b460', -            'title': 'BBC Blogs - Adam Curtis - BUGGER', +            'title': 'BUGGER',          },          'playlist_count': 18,      }, { @@ -670,10 +669,18 @@ class BBCIE(BBCCoUkIE):          'url': 'http://www.bbc.com/sport/0/football/34475836',          'info_dict': {              'id': '34475836', -            'title': 'What Liverpool can expect from Klopp', +            'title': 'Jurgen Klopp: Furious football from a witty and winning coach',          },          'playlist_count': 3,      }, { +        # school report article with single video +        'url': 'http://www.bbc.co.uk/schoolreport/35744779', +        'info_dict': { +            'id': '35744779', +            'title': 'School which breaks down barriers in Jerusalem', +        }, +        'playlist_count': 1, +    }, {          # single video with playlist URL from weather section          'url': 'http://www.bbc.com/weather/features/33601775',          'only_matching': True, @@ -735,8 +742,17 @@ class BBCIE(BBCCoUkIE):          json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)          timestamp = json_ld_info.get('timestamp') +          playlist_title = json_ld_info.get('title') -        playlist_description = json_ld_info.get('description') +        if not playlist_title: +            playlist_title = self._og_search_title( +                webpage, default=None) or self._html_search_regex( +                r'<title>(.+?)</title>', webpage, 'playlist title', default=None) +            if playlist_title: +                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + +        playlist_description = json_ld_info.get( +            'description') or self._og_search_description(webpage, default=None)          if not timestamp:              timestamp = parse_iso8601(self._search_regex( @@ -797,8 +813,6 @@ class BBCIE(BBCCoUkIE):                                  playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))          if entries: -            playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News') -            playlist_description = playlist_description or self._og_search_description(webpage, default=None)              return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)          # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) @@ -829,10 +843,6 @@ class BBCIE(BBCCoUkIE):                  'subtitles': subtitles,              } -        playlist_title = self._html_search_regex( -            r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title') -        playlist_description = self._og_search_description(webpage, default=None) -          def extract_all(pattern):              return list(filter(None, map(                  lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index bfa9c82f6..ecd7da767 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -48,6 +48,7 @@ from ..utils import (      determine_protocol,      parse_duration,      mimetype2ext, +    update_url_query,  ) @@ -345,7 +346,7 @@ class InfoExtractor(object):      def IE_NAME(self):          return compat_str(type(self).__name__[:-2]) -    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): +    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):          """ Returns the response handle """          if note is None:              self.report_download_webpage(video_id) @@ -354,6 +355,12 @@ class InfoExtractor(object):                  self.to_screen('%s' % (note,))              else:                  self.to_screen('%s: %s' % (video_id, note)) +        # data, headers and query params will be ignored for `Request` objects +        if isinstance(url_or_request, compat_str): +            if query: +                url_or_request = update_url_query(url_or_request, query) +            if data or headers: +                url_or_request = sanitized_Request(url_or_request, data, headers or {})          try:              return self._downloader.urlopen(url_or_request)          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -369,13 +376,13 @@ class InfoExtractor(object):                  self._downloader.report_warning(errmsg)                  return False -    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): +    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):          """ Returns a tuple (page content as string, URL handle) """          # Strip hashes from the URL (#1038)          if isinstance(url_or_request, (compat_str, str)):              url_or_request = url_or_request.partition('#')[0] -        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal) +        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)          if urlh is False:              assert not fatal              return False @@ -462,13 +469,13 @@ class InfoExtractor(object):          return content -    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): +    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):          """ Returns the data of the page as a string """          success = False          try_count = 0          while success is False:              try: -                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) +                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)                  success = True              except compat_http_client.IncompleteRead as e:                  try_count += 1 @@ -483,10 +490,10 @@ class InfoExtractor(object):      def _download_xml(self, url_or_request, video_id,                        note='Downloading XML', errnote='Unable to download XML', -                      transform_source=None, fatal=True, encoding=None): +                      transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):          """Return the xml as an xml.etree.ElementTree.Element"""          xml_string = self._download_webpage( -            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) +            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)          if xml_string is False:              return xml_string          if transform_source: @@ -497,10 +504,10 @@ class InfoExtractor(object):                         note='Downloading JSON metadata',                         errnote='Unable to download JSON metadata',                         transform_source=None, -                       fatal=True, encoding=None): +                       fatal=True, encoding=None, data=None, headers=None, query=None):          json_string = self._download_webpage(              url_or_request, video_id, note, errnote, fatal=fatal, -            encoding=encoding) +            encoding=encoding, data=data, headers=headers, query=query)          if (not fatal) and json_string is False:              return None          return self._parse_json( @@ -966,6 +973,13 @@ class InfoExtractor(object):          if manifest is False:              return [] +        return self._parse_f4m_formats( +            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, +            transform_source=transform_source, fatal=fatal) + +    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, +                           transform_source=lambda s: fix_xml_ampersands(s).strip(), +                           fatal=True):          formats = []          manifest_version = '1.0'          media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') @@ -991,7 +1005,8 @@ class InfoExtractor(object):                  # bitrate in f4m downloader                  if determine_ext(manifest_url) == 'f4m':                      formats.extend(self._extract_f4m_formats( -                        manifest_url, video_id, preference, f4m_id, fatal=fatal)) +                        manifest_url, video_id, preference=preference, f4m_id=f4m_id, +                        transform_source=transform_source, fatal=fatal))                      continue              tbr = int_or_none(media_el.attrib.get('bitrate'))              formats.append({ @@ -1140,8 +1155,8 @@ class InfoExtractor(object):                  out.append('{%s}%s' % (namespace, c))          return '/'.join(out) -    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): -        smil = self._download_smil(smil_url, video_id, fatal=fatal) +    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): +        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)          if smil is False:              assert not fatal @@ -1158,10 +1173,10 @@ class InfoExtractor(object):              return {}          return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) -    def _download_smil(self, smil_url, video_id, fatal=True): +    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):          return self._download_xml(              smil_url, video_id, 'Downloading SMIL file', -            'Unable to download SMIL file', fatal=fatal) +            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)      def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):          namespace = self._parse_smil_namespace(smil) @@ -1447,8 +1462,9 @@ class InfoExtractor(object):                          continue                      representation_attrib = adaptation_set.attrib.copy()                      representation_attrib.update(representation.attrib) -                    mime_type = representation_attrib.get('mimeType') -                    content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType') +                    # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory +                    mime_type = representation_attrib['mimeType'] +                    content_type = mime_type.split('/')[0]                      if content_type == 'text':                          # TODO implement WebVTT downloading                          pass @@ -1471,6 +1487,7 @@ class InfoExtractor(object):                          f = {                              'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,                              'url': base_url, +                            'ext': mimetype2ext(mime_type),                              'width': int_or_none(representation_attrib.get('width')),                              'height': int_or_none(representation_attrib.get('height')),                              'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py new file mode 100644 index 000000000..b6c985547 --- /dev/null +++ b/youtube_dl/extractor/dw.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none +from ..compat import compat_urlparse + + +class DWIE(InfoExtractor): +    IE_NAME = 'dw' +    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)' +    _TESTS = [{ +        # video +        'url': 'http://www.dw.com/en/intelligent-light/av-19112290', +        'md5': '7372046e1815c5a534b43f3c3c36e6e9', +        'info_dict': { +            'id': '19112290', +            'ext': 'mp4', +            'title': 'Intelligent light', +            'description': 'md5:90e00d5881719f2a6a5827cb74985af1', +            'upload_date': '20160311', +        } +    }, { +        # audio +        'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941', +        'md5': '2814c9a1321c3a51f8a7aeb067a360dd', +        'info_dict': { +            'id': '19111941', +            'ext': 'mp3', +            'title': 'WorldLink: My business', +            'description': 'md5:bc9ca6e4e063361e21c920c53af12405', +            'upload_date': '20160311', +        } +    }] + +    def _real_extract(self, url): +        media_id = self._match_id(url) +        webpage = self._download_webpage(url, media_id) +        hidden_inputs = self._hidden_inputs(webpage) +        title = hidden_inputs['media_title'] + +        formats = [] +        if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1': +            formats = self._extract_smil_formats( +                'http://www.dw.com/smil/v-%s' % media_id, media_id, +                transform_source=lambda s: s.replace( +                    'rtmp://tv-od.dw.de/flash/', +                    'http://tv-download.dw.de/dwtv_video/flv/')) +        else: +            formats = [{'url': hidden_inputs['file_name']}] + +        return { +            'id': media_id, +            'title': title, +            'description': self._og_search_description(webpage), +            'thumbnail': hidden_inputs.get('preview_image'), +            'duration': int_or_none(hidden_inputs.get('file_duration')), +            'upload_date': hidden_inputs.get('display_date'), +            'formats': formats, +        } + + +class DWArticleIE(InfoExtractor): +    IE_NAME = 'dw:article' +    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)' +    _TEST = { +        'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009', +        'md5': '8ca657f9d068bbef74d6fc38b97fc869', +        'info_dict': { +            'id': '19105868', +            'ext': 'mp4', +            'title': 'The harsh life of refugees in Idomeni', +            'description': 'md5:196015cc7e48ebf474db9399420043c7', +            'upload_date': '20160310', +        } +    } + +    def _real_extract(self, url): +        article_id = self._match_id(url) +        webpage = self._download_webpage(url, article_id) +        hidden_inputs = self._hidden_inputs(webpage) +        media_id = hidden_inputs['media_id'] +        media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url') +        media_url = compat_urlparse.urljoin(url, media_path) +        return self.url_result(media_url, 'DW', media_id) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 5e8589479..f5bbd39d2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -38,7 +38,8 @@ class FacebookIE(InfoExtractor):                                  story\.php                              )\?(?:.*?)(?:v|video_id|story_fbid)=|                              [^/]+/videos/(?:[^/]+/)?| -                            [^/]+/posts/ +                            [^/]+/posts/| +                            groups/[^/]+/permalink/                          )|                      facebook:                  ) @@ -123,6 +124,9 @@ class FacebookIE(InfoExtractor):      }, {          'url': 'facebook:544765982287235',          'only_matching': True, +    }, { +        'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', +        'only_matching': True,      }]      def _login(self): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca745ae41..8121f04a5 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1242,28 +1242,34 @@ class GenericIE(InfoExtractor):              full_response = self._request_webpage(request, video_id)              head_response = full_response +        info_dict = { +            'id': video_id, +            'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), +        } +          # Check for direct link to a video          content_type = head_response.headers.get('Content-Type', '')          m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)          if m:              upload_date = unified_strdate(                  head_response.headers.get('Last-Modified')) -            formats = [] -            if m.group('format_id').endswith('mpegurl'): +            format_id = m.group('format_id') +            if format_id.endswith('mpegurl'):                  formats = self._extract_m3u8_formats(url, video_id, 'mp4') +            elif format_id == 'f4m': +                formats = self._extract_f4m_formats(url, video_id)              else:                  formats = [{                      'format_id': m.group('format_id'),                      'url': url,                      'vcodec': 'none' if m.group('type') == 'audio' else None                  }] -            return { -                'id': video_id, -                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), +            info_dict.update({                  'direct': True,                  'formats': formats,                  'upload_date': upload_date, -            } +            }) +            return info_dict          if not self._downloader.params.get('test', False) and not is_intentional:              force = self._downloader.params.get('force_generic_extractor', False) @@ -1291,13 +1297,12 @@ class GenericIE(InfoExtractor):                  'URL could be a direct video link, returning it as such.')              upload_date = unified_strdate(                  head_response.headers.get('Last-Modified')) -            return { -                'id': video_id, -                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), +            info_dict.update({                  'direct': True,                  'url': url,                  'upload_date': upload_date, -            } +            }) +            return info_dict          webpage = self._webpage_read_content(              full_response, url, video_id, prefix=first_bytes) @@ -1314,12 +1319,12 @@ class GenericIE(InfoExtractor):              elif doc.tag == '{http://xspf.org/ns/0/}playlist':                  return self.playlist_result(self._parse_xspf(doc, video_id), video_id)              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): -                return { -                    'id': video_id, -                    'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), -                    'formats': self._parse_mpd_formats( -                        doc, video_id, mpd_base_url=url.rpartition('/')[0]), -                } +                info_dict['formats'] = self._parse_mpd_formats( +                    doc, video_id, mpd_base_url=url.rpartition('/')[0]) +                return info_dict +            elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): +                info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) +                return info_dict          except compat_xml_parse_error:              pass @@ -1985,6 +1990,8 @@ class GenericIE(InfoExtractor):                  entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')              elif ext == 'mpd':                  entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) +            elif ext == 'f4m': +                entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)              else:                  entry_info_dict['url'] = video_url diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 37be34091..766fc26d0 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -10,8 +10,8 @@ from ..utils import (  class GoogleDriveIE(InfoExtractor): -    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})' -    _TEST = { +    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' +    _TESTS = [{          'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',          'md5': '881f7700aec4f538571fa1e0eed4a7b6',          'info_dict': { @@ -20,7 +20,11 @@ class GoogleDriveIE(InfoExtractor):              'title': 'Big Buck Bunny.mp4',              'duration': 46,          } -    } +    }, { +        # video id is longer than 28 characters +        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', +        'only_matching': True, +    }]      _FORMATS_EXT = {          '5': 'flv',          '6': 'flv', @@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor):      @staticmethod      def _extract_url(webpage):          mobj = re.search( -            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', +            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',              webpage)          if mobj:              return 'https://drive.google.com/file/d/%s' % mobj.group('id') diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index ccbc39c66..44d7c84a1 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -8,6 +8,7 @@ from .common import InfoExtractor  from ..compat import (      compat_urllib_parse,      compat_urlparse, +    compat_parse_qs,  )  from ..utils import (      clean_html, @@ -20,21 +21,17 @@ from ..utils import (  class KalturaIE(InfoExtractor):      _VALID_URL = r'''(?x)                  (?: -                    kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)| +                    kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|                      https?://                          (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/                          (?:                              (?:                                  # flash player -                                index\.php/kwidget/ -                                (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/ -                                (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)| +                                index\.php/kwidget|                                  # html5 player -                                html5/html5lib/ -                                (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+) -                                .*\?.*\bwid=_(?P<partner_id_html5>\d+) +                                html5/html5lib/[^/]+/mwEmbedFrame\.php                              ) -                        ) +                        )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?                  )                  '''      _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?' @@ -127,10 +124,41 @@ class KalturaIE(InfoExtractor):          url, smuggled_data = unsmuggle_url(url, {})          mobj = re.match(self._VALID_URL, url) -        partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5') -        entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5') - -        info, flavor_assets = self._get_video_info(entry_id, partner_id) +        partner_id, entry_id = mobj.group('partner_id', 'id') +        ks = None +        if partner_id and entry_id: +            info, flavor_assets = self._get_video_info(entry_id, partner_id) +        else: +            path, query = mobj.group('path', 'query') +            if not path and not query: +                raise ExtractorError('Invalid URL', expected=True) +            params = {} +            if query: +                params = compat_parse_qs(query) +            if path: +                splitted_path = path.split('/') +                params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))) +            if 'wid' in params: +                partner_id = params['wid'][0][1:] +            elif 'p' in params: +                partner_id = params['p'][0] +            else: +                raise ExtractorError('Invalid URL', expected=True) +            if 'entry_id' in params: +                entry_id = params['entry_id'][0] +                info, flavor_assets = self._get_video_info(entry_id, partner_id) +            elif 'uiconf_id' in params and 'flashvars[referenceId]' in params: +                reference_id = params['flashvars[referenceId]'][0] +                webpage = self._download_webpage(url, reference_id) +                entry_data = self._parse_json(self._search_regex( +                    r'window\.kalturaIframePackageData\s*=\s*({.*});', +                    webpage, 'kalturaIframePackageData'), +                    reference_id)['entryResult'] +                info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] +                entry_id = info['id'] +            else: +                raise ExtractorError('Invalid URL', expected=True) +            ks = params.get('flashvars[ks]', [None])[0]          source_url = smuggled_data.get('source_url')          if source_url: @@ -140,14 +168,19 @@ class KalturaIE(InfoExtractor):          else:              referrer = None +        def sign_url(unsigned_url): +            if ks: +                unsigned_url += '/ks/%s' % ks +            if referrer: +                unsigned_url += '?referrer=%s' % referrer +            return unsigned_url +          formats = []          for f in flavor_assets:              # Continue if asset is not ready              if f['status'] != 2:                  continue -            video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id']) -            if referrer: -                video_url += '?referrer=%s' % referrer +            video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id']))              formats.append({                  'format_id': '%(fileExt)s-%(bitrate)s' % f,                  'ext': f.get('fileExt'), @@ -160,9 +193,7 @@ class KalturaIE(InfoExtractor):                  'width': int_or_none(f.get('width')),                  'url': video_url,              }) -        m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp') -        if referrer: -            m3u8_url += '?referrer=%s' % referrer +        m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp'))          formats.extend(self._extract_m3u8_formats(              m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/makerschannel.py new file mode 100644 index 000000000..f5d00e61d --- /dev/null +++ b/youtube_dl/extractor/makerschannel.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class MakersChannelIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849', +        'md5': '624a512c6969236b5967bf9286345ad1', +        'info_dict': { +            'id': '849', +            'ext': 'mp4', +            'title': 'Landing a bus on a plane is an epic win', +            'uploader': 'ZoomIn', +            'description': 'md5:cd9cca2ea7b69b78be81d07020c97139', +        } +    } + +    def _real_extract(self, url): +        id_type, url_id = re.match(self._VALID_URL, url).groups() +        webpage = self._download_webpage(url, url_id) +        video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data') + +        def extract_data_val(attr, fatal=False): +            return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal) +        minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') + +        return { +            '_type': 'url_transparent', +            'url': 'minoto:%s' % minoto_id, +            'id': extract_data_val('video-id', True), +            'title': extract_data_val('title', True), +            'description': extract_data_val('description'), +            'thumbnail': extract_data_val('image'), +            'uploader': extract_data_val('channel'), +        } diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py new file mode 100644 index 000000000..959a10589 --- /dev/null +++ b/youtube_dl/extractor/minoto.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class MinotoIE(InfoExtractor): +    _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        player_id = mobj.group('player_id') or '1' +        video_id = mobj.group('id') +        video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id) +        video_metadata = video_data['video-metadata'] +        formats = [] +        for fmt in video_data['video-files']: +            fmt_url = fmt.get('url') +            if not fmt_url: +                continue +            container = fmt.get('container') +            if container == 'hls': +                formats.extend(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False) +            else: +                fmt_profile = fmt.get('profile') or {} +                f = { +                    'format_id': fmt_profile.get('name-short'), +                    'format_note': fmt_profile.get('name'), +                    'url': fmt_url, +                    'container': container, +                    'tbr': int_or_none(fmt.get('bitrate')), +                    'filesize': int_or_none(fmt.get('filesize')), +                    'width': int_or_none(fmt.get('width')), +                    'height': int_or_none(fmt.get('height')), +                } +                codecs = fmt.get('codecs') +                if codecs: +                    codecs = codecs.split(',') +                    if len(codecs) == 2: +                        f.update({ +                            'vcodec': codecs[0], +                            'acodec': codecs[1], +                        }) +                formats.append(f) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': video_metadata['title'], +            'description': video_metadata.get('description'), +            'thumbnail': video_metadata.get('video-poster', {}).get('url'), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index c2b7ed9ab..101497118 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -7,6 +7,7 @@ from ..compat import compat_urllib_parse_unquote  from ..utils import (      ExtractorError,      HEADRequest, +    parse_count,      str_to_int,  ) @@ -85,8 +86,8 @@ class MixcloudIE(InfoExtractor):          uploader_id = self._search_regex(              r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)          description = self._og_search_description(webpage) -        like_count = str_to_int(self._search_regex( -            r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"', +        like_count = parse_count(self._search_regex( +            r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',              webpage, 'like count', fatal=False))          view_count = str_to_int(self._search_regex(              [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index a65fc8ed7..256396bb8 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -4,14 +4,13 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from .brightcove import BrightcoveLegacyIE  from ..utils import (      ExtractorError,      sanitized_Request, -    smuggle_url,      std_headers,      urlencode_postdata, +    update_url_query,  ) @@ -20,21 +19,22 @@ class SafariBaseIE(InfoExtractor):      _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'      _NETRC_MACHINE = 'safari' -    _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' +    _API_BASE = 'https://www.safaribooksonline.com/api/v1'      _API_FORMAT = 'json'      LOGGED_IN = False      def _real_initialize(self): -        # We only need to log in once for courses or individual videos -        if not self.LOGGED_IN: -            self._login() -            SafariBaseIE.LOGGED_IN = True +        self._login()      def _login(self): +        # We only need to log in once for courses or individual videos +        if self.LOGGED_IN: +            return +          (username, password) = self._get_login_info()          if username is None: -            self.raise_login_required('safaribooksonline.com account is required') +            return          headers = std_headers.copy()          if 'Referer' not in headers: @@ -67,6 +67,8 @@ class SafariBaseIE(InfoExtractor):                  'Login failed; make sure your credentials are correct and try again.',                  expected=True) +        SafariBaseIE.LOGGED_IN = True +          self.to_screen('Login successful') @@ -86,13 +88,15 @@ class SafariIE(SafariBaseIE):      _TESTS = [{          'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', -        'md5': '5b0c4cc1b3c1ba15dda7344085aa5592', +        'md5': 'dcc5a425e79f2564148652616af1f2a3',          'info_dict': { -            'id': '2842601850001', +            'id': '0_qbqx90ic',              'ext': 'mp4', -            'title': 'Introduction', +            'title': 'Introduction to Hadoop Fundamentals LiveLessons', +            'timestamp': 1437758058, +            'upload_date': '20150724', +            'uploader_id': 'stork',          }, -        'skip': 'Requires safaribooksonline account credentials',      }, {          'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',          'only_matching': True, @@ -107,15 +111,30 @@ class SafariIE(SafariBaseIE):          course_id = mobj.group('course_id')          part = mobj.group('part') -        webpage = self._download_webpage( -            '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part), -            part) +        webpage = self._download_webpage(url, '%s/%s' % (course_id, part)) +        reference_id = self._search_regex(r'data-reference-id="([^"]+)"', webpage, 'kaltura reference id') +        partner_id = self._search_regex(r'data-partner-id="([^"]+)"', webpage, 'kaltura widget id') +        ui_id = self._search_regex(r'data-ui-id="([^"]+)"', webpage, 'kaltura uiconf id') + +        query = { +            'wid': '_%s' % partner_id, +            'uiconf_id': ui_id, +            'flashvars[referenceId]': reference_id, +        } -        bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) -        if not bc_url: -            raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) +        if self.LOGGED_IN: +            kaltura_session = self._download_json( +                '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), +                course_id, 'Downloading kaltura session JSON', +                'Unable to download kaltura session JSON', fatal=False) +            if kaltura_session: +                session = kaltura_session.get('session') +                if session: +                    query['flashvars[ks]'] = session -        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy') +        return self.url_result(update_url_query( +            'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query), +            'Kaltura')  class SafariCourseIE(SafariBaseIE): @@ -141,7 +160,7 @@ class SafariCourseIE(SafariBaseIE):          course_id = self._match_id(url)          course_json = self._download_json( -            '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), +            '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),              course_id, 'Downloading course JSON')          if 'chapters' not in course_json: diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 3db6286e4..46c785ae1 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -1,31 +1,37 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from .ooyala import OoyalaIE  from ..utils import ExtractorError  class ViceIE(InfoExtractor): -    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)' - -    _TESTS = [ -        { -            'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', -            'info_dict': { -                'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', -                'ext': 'mp4', -                'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', -                'duration': 725.983, -            }, -            'params': { -                # Requires ffmpeg (m3u8 manifest) -                'skip_download': True, -            }, -        }, { -            'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', -            'only_matching': True, -        } -    ] +    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)' + +    _TESTS = [{ +        'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', +        'info_dict': { +            'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', +            'ext': 'mp4', +            'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', +            'duration': 725.983, +        }, +        'params': { +            # Requires ffmpeg (m3u8 manifest) +            'skip_download': True, +        }, +    }, { +        'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', +        'only_matching': True, +    }, { +        'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229', +        'only_matching': True, +    }, { +        'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -38,3 +44,35 @@ class ViceIE(InfoExtractor):          except ExtractorError:              raise ExtractorError('The page doesn\'t contain a video', expected=True)          return self.url_result(ooyala_url, ie='Ooyala') + + +class ViceShowIE(InfoExtractor): +    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)' + +    _TEST = { +        'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2', +        'info_dict': { +            'id': 'fuck-thats-delicious-2', +            'title': "Fuck, That's Delicious", +            'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.', +        }, +        'playlist_count': 17, +    } + +    def _real_extract(self, url): +        show_id = self._match_id(url) +        webpage = self._download_webpage(url, show_id) + +        entries = [ +            self.url_result(video_url, ViceIE.ie_key()) +            for video_url, _ in re.findall( +                r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"' +                % ViceIE._VALID_URL, webpage)] + +        title = self._search_regex( +            r'<title>(.+?)</title>', webpage, 'title', default=None) +        if title: +            title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() +        description = self._html_search_meta('description', webpage, 'description') + +        return self.playlist_result(entries, show_id, title, description) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 22a39a0ab..9fd0ec8d5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1316,6 +1316,17 @@ def format_bytes(bytes):      return '%.2f%s' % (converted, suffix) +def lookup_unit_table(unit_table, s): +    units_re = '|'.join(re.escape(u) for u in unit_table) +    m = re.match( +        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s) +    if not m: +        return None +    num_str = m.group('num').replace(',', '.') +    mult = unit_table[m.group('unit')] +    return int(float(num_str) * mult) + +  def parse_filesize(s):      if s is None:          return None @@ -1359,15 +1370,28 @@ def parse_filesize(s):          'Yb': 1000 ** 8,      } -    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE) -    m = re.match( -        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s) -    if not m: +    return lookup_unit_table(_UNIT_TABLE, s) + + +def parse_count(s): +    if s is None:          return None -    num_str = m.group('num').replace(',', '.') -    mult = _UNIT_TABLE[m.group('unit')] -    return int(float(num_str) * mult) +    s = s.strip() + +    if re.match(r'^[\d,.]+$', s): +        return str_to_int(s) + +    _UNIT_TABLE = { +        'k': 1000, +        'K': 1000, +        'm': 1000 ** 2, +        'M': 1000 ** 2, +        'kk': 1000 ** 2, +        'KK': 1000 ** 2, +    } + +    return lookup_unit_table(_UNIT_TABLE, s)  def month_by_name(name): | 
