diff options
| -rw-r--r-- | youtube_dl/extractor/pbs.py | 57 | 
1 files changed, 50 insertions, 7 deletions
| diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8d6f2dd3d..52ab2f158 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import (      ExtractorError,      determine_ext, @@ -376,6 +377,35 @@ class PBSIE(InfoExtractor):              'expected_warnings': ['HTTP Error 403: Forbidden'],          },          { +            'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/', +            'info_dict': { +                'id': '3007193718', +                'ext': 'mp4', +                'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster", +                'description': 'md5:37efbac85e0c09b009586523ec143652', +                'duration': 6292, +                'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', +            }, +            'params': { +                'skip_download': True, +            }, +            'expected_warnings': ['HTTP Error 403: Forbidden'], +        }, +        { +            'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/', +            'info_dict': { +                'id': '3011407934', +                'ext': 'mp4', +                'title': 'Stories from the Stage - Road Trip', +                'duration': 1619, +                'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', +            }, +            'params': { +                'skip_download': True, +            }, +            'expected_warnings': ['HTTP Error 403: Forbidden'], +        }, +        {              'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',              'only_matching': True,          }, @@ -438,6 +468,7 @@ class PBSIE(InfoExtractor):                  r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>',  # jwplayer                  r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",                  r'<div[^>]+\bdata-cove-id=["\'](\d+)"',  # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ +                r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)',  # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/              ]              media_id = self._search_regex( @@ -472,7 +503,8 @@ class PBSIE(InfoExtractor):              if not url:                  url = self._og_search_url(webpage) -            mobj = re.match(self._VALID_URL, url) +            mobj = re.match( +                self._VALID_URL, self._proto_relative_url(url.strip()))          player_id = mobj.group('player_id')          if not display_id: @@ -482,13 +514,27 @@ class PBSIE(InfoExtractor):                  url, display_id, note='Downloading player page',                  errnote='Could not download player page')              video_id = self._search_regex( -                r'<div\s+id="video_([0-9]+)"', player_page, 'video ID') +                r'<div\s+id=["\']video_(\d+)', player_page, 'video ID', +                default=None) +            if not video_id: +                video_info = self._extract_video_data( +                    player_page, 'video data', display_id) +                video_id = compat_str( +                    video_info.get('id') or video_info['contentID'])          else:              video_id = mobj.group('id')              display_id = video_id          return video_id, display_id, None, description +    def _extract_video_data(self, string, name, video_id, fatal=True): +        return self._parse_json( +            self._search_regex( +                [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', +                 r'window\.videoBridge\s*=\s*({.+?});'], +                string, name, default='{}'), +            video_id, transform_source=js_to_json, fatal=fatal) +      def _real_extract(self, url):          video_id, display_id, upload_date, description = self._extract_webpage(url) @@ -519,11 +565,8 @@ class PBSIE(InfoExtractor):                  'http://player.pbs.org/%s/%s' % (page, video_id),                  display_id, 'Downloading %s page' % page, fatal=False)              if player: -                video_info = self._parse_json( -                    self._search_regex( -                        [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', r'window\.videoBridge\s*=\s*({.+?});'], -                        player, '%s video data' % page, default='{}'), -                    display_id, transform_source=js_to_json, fatal=False) +                video_info = self._extract_video_data( +                    player, '%s video data' % page, display_id, fatal=False)                  if video_info:                      extract_redirect_urls(video_info)                      if not info: | 
