diff options
Diffstat (limited to 'youtube_dl/extractor/brightcove.py')
| -rw-r--r-- | youtube_dl/extractor/brightcove.py | 57 | 
1 files changed, 33 insertions, 24 deletions
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index c7059a12a..124497e95 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -489,7 +489,7 @@ class BrightcoveNewIE(InfoExtractor):          return urls[0] if urls else None      @staticmethod -    def _extract_urls(webpage): +    def _extract_urls(ie, webpage):          # Reference:          # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe          # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag @@ -506,37 +506,46 @@ class BrightcoveNewIE(InfoExtractor):          # Look for <video> tags [2] and embed_in_page embeds [3]          # [2] looks like: -        # <video data-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> -          for video, script_tag, account_id, player_id, embed in re.findall( -            r'''(?isx) -                (<video[^>]+>) -                (?:.*? -                  (<script[^>]+ -                    src=["\'](?:https?:)?//players\.brightcove\.net/ -                    (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js -                  ) -                )? -            ''', webpage -        ): +                r'''(?isx) +                    (<video\s+[^>]+>) +                    (?:.*? +                        (<script[^>]+ +                            src=["\'](?:https?:)?//players\.brightcove\.net/ +                            (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js +                        ) +                    )? +                ''', webpage):              attrs = extract_attributes(video)              # According to examples from [4] it's unclear whether video id              # may be optional and what to do when it is              video_id = attrs.get('data-video-id') -            # See PR#12099/bostonglobe.py for 'data-brightcove-video-id' variant +            if not video_id: +                continue +            account_id = account_id or attrs.get('data-account')              if not account_id: -                account_id = attrs.get('data-account') -            if not player_id: -                player_id = attrs.get('data-player') -            if not embed: -                embed = attrs.get('data-embed') - -            if video_id and account_id and player_id and embed: -                entries.append( -                    'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' -                    % (account_id, player_id, embed, video_id)) +                continue + +            player_id = player_id or attrs.get('data-player') or 'default' +            embed = embed or attrs.get('data-embed') or 'default' + +            bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % ( +                account_id, player_id, embed, video_id) + +            # Some brightcove videos may be embedded with video tag only and +            # without script tag or any mentioning of brightcove at all. Such +            # embeds are considered ambiguous since they are matched based only +            # on data-video-id and data-account attributes and in the wild may +            # not be brightcove embeds at all. Let's check reconstructed +            # brightcove URLs in case of such embeds and only process valid +            # ones. By this we ensure there is indeed a brightcove embed. +            if not script_tag and not ie._is_valid_url( +                    bc_url, video_id, 'possible brightcove video'): +                continue + +            entries.append(bc_url)          return entries  | 
