| -rw-r--r-- | AUTHORS                             |  1 |
| -rw-r--r-- | youtube_dl/extractor/__init__.py    |  5 |
| -rw-r--r-- | youtube_dl/extractor/audioboom.py   | 66 |
| -rw-r--r-- | youtube_dl/extractor/common.py      | 39 |
| -rw-r--r-- | youtube_dl/extractor/dw.py          | 85 |
| -rw-r--r-- | youtube_dl/extractor/facebook.py    |  6 |
| -rw-r--r-- | youtube_dl/extractor/googledrive.py | 12 |
7 files changed, 194 insertions, 20 deletions
diff --git a/AUTHORS b/AUTHORS
--- a/AUTHORS
+++ b/AUTHORS
@@ -162,3 +162,4 @@ Robin Houtevelts
 Patrick Griffis
 Aidan Rowe
 mutantmonkey
+Ben Congdon
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 98de5ddff..c5ca01ee7 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -54,6 +54,7 @@ from .arte import (
 from .atresplayer import AtresPlayerIE
 from .atttechchannel import ATTTechChannelIE
 from .audimedia import AudiMediaIE
+from .audioboom import AudioBoomIE
 from .audiomack import AudiomackIE, AudiomackAlbumIE
 from .azubu import AzubuIE, AzubuLiveIE
 from .baidu import BaiduVideoIE
@@ -188,6 +189,10 @@ from .dumpert import DumpertIE
 from .defense import DefenseGouvFrIE
 from .discovery import DiscoveryIE
 from .dropbox import DropboxIE
+from .dw import (
+    DWIE,
+    DWArticleIE,
+)
 from .eagleplatform import EaglePlatformIE
 from .ebaumsworld import EbaumsWorldIE
 from .echomsk import EchoMskIE
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
new file mode 100644
index 000000000..2ec2d7092
--- /dev/null
+++ b/youtube_dl/extractor/audioboom.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class AudioBoomIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0',
+        'md5': '63a8d73a055c6ed0f1e51921a10a5a76',
+        'info_dict': {
+            'id': '4279833',
+            'ext': 'mp3',
+            'title': '3/09/2016 Czaban Hour 3',
+            'description': 'Guest:   Nate Davis - NFL free agency,   Guest:   Stan Gans',
+            'duration': 2245.72,
+            'uploader': 'Steve Czaban',
+            'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        clip = None
+
+        clip_store = self._parse_json(
+            self._search_regex(
+                r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id,
+                webpage, 'clip store', default='{}', group='json'),
+            video_id, fatal=False)
+        if clip_store:
+            clips = clip_store.get('clips')
+            if clips and isinstance(clips, list) and isinstance(clips[0], dict):
+                clip = clips[0]
+
+        def from_clip(field):
+            if clip:
+                return clip.get(field)
+
+        audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
+            'audio', webpage, 'audio url')
+        title = from_clip('title') or self._og_search_title(webpage)
+        description = from_clip('description') or self._og_search_description(webpage)
+
+        duration = float_or_none(from_clip('duration') or self._html_search_meta(
+            'weibo:audio:duration', webpage))
+
+        uploader = from_clip('author') or self._og_search_property(
+            'audio:artist', webpage, 'uploader', fatal=False)
+        uploader_url = from_clip('author_url') or self._html_search_meta(
+            'audioboo:channel', webpage, 'uploader url')
+
+        return {
+            'id': video_id,
+            'url': audio_url,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'uploader': uploader,
+            'uploader_url': uploader_url,
+        }
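
The new AudioBoom extractor can be exercised end to end through youtube-dl's embedding API. A minimal sketch, assuming a checkout of this branch is on the Python path (the URL is the one from the _TEST block above; nothing is downloaded):

from __future__ import print_function

import youtube_dl

# YoutubeDL resolves the URL via AudioBoomIE, registered in __init__.py above.
with youtube_dl.YoutubeDL() as ydl:
    info = ydl.extract_info(
        'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3', download=False)
    print(info['id'], info['title'], info.get('uploader'))
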
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index bfa9c82f6..0b8b906ab 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -48,6 +48,7 @@ from ..utils import (
     determine_protocol,
     parse_duration,
     mimetype2ext,
+    update_url_query,
 )
@@ -345,7 +346,7 @@ class InfoExtractor(object):
     def IE_NAME(self):
         return compat_str(type(self).__name__[:-2])
 
-    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
         """ Returns the response handle """
         if note is None:
             self.report_download_webpage(video_id)
@@ -354,6 +355,12 @@
                 self.to_screen('%s' % (note,))
             else:
                 self.to_screen('%s: %s' % (video_id, note))
+        # data, headers and query params will be ignored for `Request` objects
+        if isinstance(url_or_request, compat_str):
+            if query:
+                url_or_request = update_url_query(url_or_request, query)
+            if data or headers:
+                url_or_request = sanitized_Request(url_or_request, data, headers or {})
         try:
             return self._downloader.urlopen(url_or_request)
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -369,13 +376,13 @@
                 self._downloader.report_warning(errmsg)
                 return False
 
-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
         """ Returns a tuple (page content as string, URL handle) """
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, (compat_str, str)):
             url_or_request = url_or_request.partition('#')[0]
 
-        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
+        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
         if urlh is False:
             assert not fatal
             return False
@@ -462,13 +469,13 @@
         return content
 
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
         """ Returns the data of the page as a string """
         success = False
         try_count = 0
         while success is False:
             try:
-                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
                 success = True
             except compat_http_client.IncompleteRead as e:
                 try_count += 1
@@ -483,10 +490,10 @@
     def _download_xml(self, url_or_request, video_id,
                       note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True, encoding=None):
+                      transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
         """Return the xml as an xml.etree.ElementTree.Element"""
         xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
+            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
         if xml_string is False:
             return xml_string
         if transform_source:
@@ -497,10 +504,10 @@
                        note='Downloading JSON metadata',
                        errnote='Unable to download JSON metadata',
                        transform_source=None,
-                       fatal=True, encoding=None):
+                       fatal=True, encoding=None, data=None, headers=None, query=None):
         json_string = self._download_webpage(
             url_or_request, video_id, note, errnote, fatal=fatal,
-            encoding=encoding)
+            encoding=encoding, data=data, headers=headers, query=query)
         if (not fatal) and json_string is False:
             return None
         return self._parse_json(
@@ -1140,8 +1147,8 @@
                 out.append('{%s}%s' % (namespace, c))
         return '/'.join(out)
 
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
-        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
         if smil is False:
             assert not fatal
@@ -1158,10 +1165,10 @@
             return {}
         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
 
-    def _download_smil(self, smil_url, video_id, fatal=True):
+    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
         return self._download_xml(
             smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
 
     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
         namespace = self._parse_smil_namespace(smil)
@@ -1447,8 +1454,9 @@
                        continue
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
-                    mime_type = representation_attrib.get('mimeType')
-                    content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
+                    # According to page 41 of ISO/IEC 23009-1:2014, @mimeType is mandatory
+                    mime_type = representation_attrib['mimeType']
+                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                        pass
@@ -1471,6 +1479,7 @@
                        f = {
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'url': base_url,
+                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
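
With data, headers and query threaded through the download helpers, extractors can hand request parameters straight to _download_webpage/_download_json instead of building sanitized_Request objects themselves. A minimal sketch of how a hypothetical extractor method might use the new keyword arguments (the endpoint, header and parameter names are invented for illustration; as the comment added in _request_webpage says, they are ignored when a prebuilt Request object is passed):

# Inside a hypothetical InfoExtractor subclass; passing bytes via data=...
# would turn the request into a POST.
metadata = self._download_json(
    'http://api.example.com/v1/video', video_id,
    note='Downloading video metadata',
    query={'id': video_id, 'format': 'json'},        # appended to the URL via update_url_query
    headers={'X-Requested-With': 'XMLHttpRequest'})  # extra headers for this request only
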
diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py
new file mode 100644
index 000000000..b6c985547
--- /dev/null
+++ b/youtube_dl/extractor/dw.py
@@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+from ..compat import compat_urlparse
+
+
+class DWIE(InfoExtractor):
+    IE_NAME = 'dw'
+    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)'
+    _TESTS = [{
+        # video
+        'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
+        'md5': '7372046e1815c5a534b43f3c3c36e6e9',
+        'info_dict': {
+            'id': '19112290',
+            'ext': 'mp4',
+            'title': 'Intelligent light',
+            'description': 'md5:90e00d5881719f2a6a5827cb74985af1',
+            'upload_date': '20160311',
+        }
+    }, {
+        # audio
+        'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941',
+        'md5': '2814c9a1321c3a51f8a7aeb067a360dd',
+        'info_dict': {
+            'id': '19111941',
+            'ext': 'mp3',
+            'title': 'WorldLink: My business',
+            'description': 'md5:bc9ca6e4e063361e21c920c53af12405',
+            'upload_date': '20160311',
+        }
+    }]
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        webpage = self._download_webpage(url, media_id)
+        hidden_inputs = self._hidden_inputs(webpage)
+        title = hidden_inputs['media_title']
+
+        formats = []
+        if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
+            formats = self._extract_smil_formats(
+                'http://www.dw.com/smil/v-%s' % media_id, media_id,
+                transform_source=lambda s: s.replace(
+                    'rtmp://tv-od.dw.de/flash/',
+                    'http://tv-download.dw.de/dwtv_video/flv/'))
+        else:
+            formats = [{'url': hidden_inputs['file_name']}]
+
+        return {
+            'id': media_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': hidden_inputs.get('preview_image'),
+            'duration': int_or_none(hidden_inputs.get('file_duration')),
+            'upload_date': hidden_inputs.get('display_date'),
+            'formats': formats,
+        }
+
+
+class DWArticleIE(InfoExtractor):
+    IE_NAME = 'dw:article'
+    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009',
+        'md5': '8ca657f9d068bbef74d6fc38b97fc869',
+        'info_dict': {
+            'id': '19105868',
+            'ext': 'mp4',
+            'title': 'The harsh life of refugees in Idomeni',
+            'description': 'md5:196015cc7e48ebf474db9399420043c7',
+            'upload_date': '20160310',
+        }
+    }
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        webpage = self._download_webpage(url, article_id)
+        hidden_inputs = self._hidden_inputs(webpage)
+        media_id = hidden_inputs['media_id']
+        media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url')
+        media_url = compat_urlparse.urljoin(url, media_path)
+        return self.url_result(media_url, 'DW', media_id)
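
DWIE is the first user of the new transform_source hook on _extract_smil_formats: it rewrites the RTMP base URL inside DW's SMIL documents into a plain HTTP download prefix before the SMIL is parsed. A standalone sketch of that substitution (the sample src path is invented for illustration):

def rewrite_dw_smil(smil_text):
    # Same replacement the extractor passes as transform_source above.
    return smil_text.replace(
        'rtmp://tv-od.dw.de/flash/',
        'http://tv-download.dw.de/dwtv_video/flv/')

sample = '<video src="rtmp://tv-od.dw.de/flash/some/clip_sd_avc.mp4"/>'
print(rewrite_dw_smil(sample))
# <video src="http://tv-download.dw.de/dwtv_video/flv/some/clip_sd_avc.mp4"/>
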
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 5e8589479..f5bbd39d2 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -38,7 +38,8 @@ class FacebookIE(InfoExtractor):
                                 story\.php
                             )\?(?:.*?)(?:v|video_id|story_fbid)=|
                             [^/]+/videos/(?:[^/]+/)?|
-                            [^/]+/posts/
+                            [^/]+/posts/|
+                            groups/[^/]+/permalink/
                         )|
                     facebook:
                 )
@@ -123,6 +124,9 @@
     }, {
         'url': 'facebook:544765982287235',
         'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
+        'only_matching': True,
     }]
 
     def _login(self):
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py
index 37be34091..766fc26d0 100644
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@@ -10,8 +10,8 @@ from ..utils import (
 
 
 class GoogleDriveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})'
-    _TEST = {
+    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
+    _TESTS = [{
         'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
         'md5': '881f7700aec4f538571fa1e0eed4a7b6',
         'info_dict': {
@@ -20,7 +20,11 @@ class GoogleDriveIE(InfoExtractor):
             'title': 'Big Buck Bunny.mp4',
             'duration': 46,
         }
-    }
+    }, {
+        # video id is longer than 28 characters
+        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
+        'only_matching': True,
+    }]
     _FORMATS_EXT = {
         '5': 'flv',
         '6': 'flv',
@@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor):
     @staticmethod
     def _extract_url(webpage):
         mobj = re.search(
-            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})',
+            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
             webpage)
         if mobj:
             return 'https://drive.google.com/file/d/%s' % mobj.group('id')
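
The Google Drive change only relaxes the ID length: the old pattern required exactly 28 characters, while newer file IDs can be longer, hence {28} becoming {28,}. A quick check against the two test URLs from the diff (the variable name is just for the example):

import re

_VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'

for url in (
        'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',    # 28-character id
        'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit'):  # longer id
    mobj = re.match(_VALID_URL, url)
    print(mobj.group('id') if mobj else 'no match')
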
