diff options
| -rw-r--r-- | youtube_dl/extractor/bostonglobe.py | 72 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 28 | ||||
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 77 | 
4 files changed, 126 insertions, 52 deletions
| diff --git a/youtube_dl/extractor/bostonglobe.py b/youtube_dl/extractor/bostonglobe.py new file mode 100644 index 000000000..57882fbee --- /dev/null +++ b/youtube_dl/extractor/bostonglobe.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( +    extract_attributes, +) + + +class BostonGlobeIE(InfoExtractor): +    _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?' +    _TESTS = [ +        { +            'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html', +            'md5': '0a62181079c85c2d2b618c9a738aedaf', +            'info_dict': { +                'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood', +                'id': '5320421710001', +                'ext': 'mp4', +                'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.', +                'timestamp': 1486877593, +                'upload_date': '20170212', +                'uploader_id': '245991542', +            }, +        }, +        { +            # Embedded youtube video; we hand it off to the Generic extractor. 
+            'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html', +            'md5': '582b40327089d5c0c949b3c54b13c24b', +            'info_dict': { +                'title': "Who Is Matt Damon's Favorite Batman?", +                'id': 'ZW1QCnlA6Qc', +                'ext': 'mp4', +                'upload_date': '20170217', +                'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb', +                'uploader': 'The Late Late Show with James Corden', +                'uploader_id': 'TheLateLateShow', +            }, +            'expected_warnings': ['404'], +        }, +    ] + +    def _real_extract(self, url): +        page_id = self._match_id(url) +        webpage = self._download_webpage(url, page_id) + +        page_title = self._og_search_title(webpage, default=None) + +        # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> +        entries = [] +        for video in re.findall(r'(?i)(<video[^>]+>)', webpage): +            attrs = extract_attributes(video) + +            video_id = attrs.get('data-brightcove-video-id') +            account_id = attrs.get('data-account') +            player_id = attrs.get('data-player') +            embed = attrs.get('data-embed') + +            if video_id and account_id and player_id and embed: +                entries.append( +                    'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' +                    % (account_id, player_id, embed, video_id)) + +        if len(entries) == 0: +            return self.url_result(url, 'Generic') +        elif len(entries) == 1: +            return self.url_result(entries[0], 'BrightcoveNew') +        else: +            return self.playlist_from_matches(entries, page_id, page_title, 
ie='BrightcoveNew') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b51799bfa..0852b8e8c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -36,34 +36,35 @@ from ..utils import (      clean_html,      compiled_regex_type,      determine_ext, +    determine_protocol,      error_to_compat_str,      ExtractorError, +    extract_attributes,      fix_xml_ampersands,      float_or_none,      GeoRestrictedError,      GeoUtils,      int_or_none,      js_to_json, +    mimetype2ext, +    orderedSet, +    parse_codecs, +    parse_duration,      parse_iso8601, +    parse_m3u8_attributes,      RegexNotFoundError, -    sanitize_filename,      sanitized_Request, +    sanitize_filename,      unescapeHTML,      unified_strdate,      unified_timestamp, +    update_Request, +    update_url_query, +    urljoin,      url_basename,      xpath_element,      xpath_text,      xpath_with_ns, -    determine_protocol, -    parse_duration, -    mimetype2ext, -    update_Request, -    update_url_query, -    parse_m3u8_attributes, -    extract_attributes, -    parse_codecs, -    urljoin,  ) @@ -714,6 +715,13 @@ class InfoExtractor(object):              video_info['title'] = video_title          return video_info +    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None): +        urlrs = orderedSet( +            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) +            for m in matches) +        return self.playlist_result( +            urlrs, playlist_id=video_id, playlist_title=video_title) +      @staticmethod      def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):          """Returns a playlist""" diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 97d68d9ca..40a5c9842 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -117,6 +117,7 @@ from 
.bleacherreport import (  from .blinkx import BlinkxIE  from .bloomberg import BloombergIE  from .bokecc import BokeCCIE +from .bostonglobe import BostonGlobeIE  from .bpb import BpbIE  from .br import BRIE  from .bravotv import BravoTVIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0fcb3fdac..a71d6bac0 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1841,14 +1841,6 @@ class GenericIE(InfoExtractor):          video_description = self._og_search_description(webpage, default=None)          video_thumbnail = self._og_search_thumbnail(webpage, default=None) -        # Helper method -        def _playlist_from_matches(matches, getter=None, ie=None): -            urlrs = orderedSet( -                self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) -                for m in matches) -            return self.playlist_result( -                urlrs, playlist_id=video_id, playlist_title=video_title) -          # Look for Brightcove Legacy Studio embeds          bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)          if bc_urls: @@ -1869,28 +1861,28 @@ class GenericIE(InfoExtractor):          # Look for Brightcove New Studio embeds          bc_urls = BrightcoveNewIE._extract_urls(webpage)          if bc_urls: -            return _playlist_from_matches(bc_urls, ie='BrightcoveNew') +            return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')          # Look for ThePlatform embeds          tp_urls = ThePlatformIE._extract_urls(webpage)          if tp_urls: -            return _playlist_from_matches(tp_urls, ie='ThePlatform') +            return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')          # Look for Vessel embeds          vessel_urls = VesselIE._extract_urls(webpage)          if vessel_urls: -            return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) +            
return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key())          # Look for embedded rtl.nl player          matches = re.findall(              r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',              webpage)          if matches: -            return _playlist_from_matches(matches, ie='RtlNl') +            return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')          vimeo_urls = VimeoIE._extract_urls(url, webpage)          if vimeo_urls: -            return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key()) +            return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())          vid_me_embed_url = self._search_regex(              r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', @@ -1912,25 +1904,25 @@ class GenericIE(InfoExtractor):                  (?:embed|v|p)/.+?)              \1''', webpage)          if matches: -            return _playlist_from_matches( -                matches, lambda m: unescapeHTML(m[1])) +            return self.playlist_from_matches( +                matches, video_id, video_title, lambda m: unescapeHTML(m[1]))          # Look for lazyYT YouTube embed          matches = re.findall(              r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)          if matches: -            return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) +            return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))          # Look for Wordpress "YouTube Video Importer" plugin          matches = re.findall(r'''(?x)<div[^>]+              class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+              data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)          if matches: -            return _playlist_from_matches(matches, lambda m: m[-1]) +            return self.playlist_from_matches(matches, video_id, video_title, 
lambda m: m[-1])          matches = DailymotionIE._extract_urls(webpage)          if matches: -            return _playlist_from_matches(matches) +            return self.playlist_from_matches(matches, video_id, video_title)          # Look for embedded Dailymotion playlist player (#3822)          m = re.search( @@ -1939,8 +1931,8 @@ class GenericIE(InfoExtractor):              playlists = re.findall(                  r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))              if playlists: -                return _playlist_from_matches( -                    playlists, lambda p: '//dailymotion.com/playlist/%s' % p) +                return self.playlist_from_matches( +                    playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)          # Look for embedded Wistia player          match = re.search( @@ -2047,8 +2039,9 @@ class GenericIE(InfoExtractor):          if mobj is not None:              embeds = self._parse_json(mobj.group(1), video_id, fatal=False)              if embeds: -                return _playlist_from_matches( -                    embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') +                return self.playlist_from_matches( +                    embeds, video_id, video_title, +                    getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')          # Look for Aparat videos          mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) @@ -2110,13 +2103,13 @@ class GenericIE(InfoExtractor):          # Look for funnyordie embed          matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)          if matches: -            return _playlist_from_matches( -                matches, getter=unescapeHTML, ie='FunnyOrDie') +            return self.playlist_from_matches( +            
    matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')          # Look for BBC iPlayer embed          matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)          if matches: -            return _playlist_from_matches(matches, ie='BBCCoUk') +            return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk')          # Look for embedded RUTV player          rutv_url = RUTVIE._extract_url(webpage) @@ -2131,32 +2124,32 @@ class GenericIE(InfoExtractor):          # Look for embedded SportBox player          sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)          if sportbox_urls: -            return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') +            return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')          # Look for embedded XHamster player          xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)          if xhamster_urls: -            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') +            return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed')          # Look for embedded TNAFlixNetwork player          tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)          if tnaflix_urls: -            return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) +            return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key())          # Look for embedded PornHub player          pornhub_urls = PornHubIE._extract_urls(webpage)          if pornhub_urls: -            return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key()) +            return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key())          # Look for embedded DrTuber player          drtuber_urls = DrTuberIE._extract_urls(webpage)          if drtuber_urls: -            
return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key()) +            return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key())          # Look for embedded RedTube player          redtube_urls = RedTubeIE._extract_urls(webpage)          if redtube_urls: -            return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key()) +            return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())          # Look for embedded Tvigle player          mobj = re.search( @@ -2202,12 +2195,12 @@ class GenericIE(InfoExtractor):          # Look for embedded soundcloud player          soundcloud_urls = SoundcloudIE._extract_urls(webpage)          if soundcloud_urls: -            return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) +            return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key())          # Look for tunein player          tunein_urls = TuneInBaseIE._extract_urls(webpage)          if tunein_urls: -            return _playlist_from_matches(tunein_urls) +            return self.playlist_from_matches(tunein_urls, video_id, video_title)          # Look for embedded mtvservices player          mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) @@ -2490,35 +2483,35 @@ class GenericIE(InfoExtractor):          # Look for DBTV embeds          dbtv_urls = DBTVIE._extract_urls(webpage)          if dbtv_urls: -            return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key()) +            return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key())          # Look for Videa embeds          videa_urls = VideaIE._extract_urls(webpage)          if videa_urls: -            return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key()) +            return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key())     
     # Look for 20 minuten embeds          twentymin_urls = TwentyMinutenIE._extract_urls(webpage)          if twentymin_urls: -            return _playlist_from_matches( -                twentymin_urls, ie=TwentyMinutenIE.ie_key()) +            return self.playlist_from_matches( +                twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())          # Look for Openload embeds          openload_urls = OpenloadIE._extract_urls(webpage)          if openload_urls: -            return _playlist_from_matches( -                openload_urls, ie=OpenloadIE.ie_key()) +            return self.playlist_from_matches( +                openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())          # Look for VideoPress embeds          videopress_urls = VideoPressIE._extract_urls(webpage)          if videopress_urls: -            return _playlist_from_matches( -                videopress_urls, ie=VideoPressIE.ie_key()) +            return self.playlist_from_matches( +                videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key())          # Look for Rutube embeds          rutube_urls = RutubeIE._extract_urls(webpage)          if rutube_urls: -            return _playlist_from_matches( +            return self.playlist_from_matches( -                rutube_urls, ie=RutubeIE.ie_key()) +                rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())          # Looking for http://schema.org/VideoObject | 
