diff options
| author | remitamine <remitamine@gmail.com> | 2016-03-17 16:07:36 +0100 | 
|---|---|---|
| committer | remitamine <remitamine@gmail.com> | 2016-03-17 16:11:29 +0100 | 
| commit | 574b2a7393ef389792d5010704e505ef0eaaa5e8 (patch) | |
| tree | fe5f90c683f7de986a3028a4aba20cafc643d363 | |
| parent | 9f02ff537c6ddfd3f1ea3586f3e44f0ec07a2aea (diff) | |
[nbc:nbcnews] improve extraction(fixes #6922)
- extract more metadata and formats
- relax regex
| -rw-r--r-- | youtube_dl/extractor/nbc.py | 137 | 
1 files changed, 91 insertions, 46 deletions
| diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 2202cfa33..bb0817e34 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -3,13 +3,16 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_HTTPError +from .theplatform import ThePlatformIE  from ..utils import ( -    ExtractorError,      find_xpath_attr,      lowercase_escape,      smuggle_url,      unescapeHTML, +    update_url_query, +    int_or_none, +    HEADRequest, +    parse_iso8601,  ) @@ -131,10 +134,10 @@ class NBCSportsIE(InfoExtractor):              NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') -class NBCNewsIE(InfoExtractor): +class NBCNewsIE(ThePlatformIE):      _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/          (?:video/.+?/(?P<id>\d+)| -        (?:watch|feature|nightly-news)/[^/]+/(?P<title>.+)) +        ([^/]+/)*(?P<display_id>[^/?]+))          '''      _TESTS = [ @@ -149,15 +152,14 @@ class NBCNewsIE(InfoExtractor):              },          },          { -            'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236', -            'md5': 'b2421750c9f260783721d898f4c42063', +            'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', +            'md5': 'af1adfa51312291a017720403826bb64',              'info_dict': { -                'id': 'I1wpAI_zmhsQ', +                'id': '269389891880',                  'ext': 'mp4',                  'title': 'How Twitter Reacted To The Snowden Interview',                  'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',              }, -            'add_ie': ['ThePlatform'],          },          {              'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', @@ -168,18 +170,30 @@ class NBCNewsIE(InfoExtractor):                  'title': 'FULL EPISODE: Family Business',                  'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',              }, +            'skip': 'This page is unavailable.',          },          {              'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', -            'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d', +            'md5': '73135a2e0ef819107bbb55a5a9b2a802',              'info_dict': { -                'id': 'sekXqyTVnmN3', +                'id': '394064451844',                  'ext': 'mp4',                  'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',                  'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',              },          },          { +            'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', +            'md5': 'a49e173825e5fcd15c13fc297fced39d', +            'info_dict': { +                'id': '529953347624', +                'ext': 'mp4', +                'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'', +                'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', +            }, +            'expected_warnings': ['http-6000 is not available'] +        }, +        {              'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952',              'only_matching': True,          }, @@ -202,49 +216,80 @@ class NBCNewsIE(InfoExtractor):              }          else:              # "feature" and "nightly-news" pages use theplatform.com -            title = mobj.group('title') -            webpage = self._download_webpage(url, title) +            display_id = mobj.group('display_id') +            webpage = self._download_webpage(url, display_id) +            info = None              bootstrap_json = self._search_regex( -                r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', -                webpage, 'bootstrap json', flags=re.MULTILINE) -            bootstrap = self._parse_json(bootstrap_json, video_id) -            info = bootstrap['results'][0]['video'] -            mpxid = info['mpxId'] +                r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', +                webpage, 'bootstrap json', default=None) +            if bootstrap_json: +                bootstrap = self._parse_json(bootstrap_json, display_id) +                info = bootstrap['results'][0]['video'] +            else: +                player_instance_json = self._search_regex( +                    r'videoObj\s*:\s*({.+})', webpage, 'player instance') +                info = self._parse_json(player_instance_json, display_id) +            video_id = info['mpxId'] +            title = info['title'] -            base_urls = [ -                info['fallbackPlaylistUrl'], -                info['associatedPlaylistUrl'], -            ] +            subtitles = {} +            caption_links = info.get('captionLinks') +            if caption_links: +                for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')): +                    sub_url = caption_links.get(sub_key) +                    if sub_url: +                        subtitles.setdefault('en', []).append({ +                            'url': sub_url, +                            'ext': sub_ext, +                        }) -            for base_url in base_urls: -                if not base_url: +            formats = [] +            for video_asset in info['videoAssets']: +                video_url = video_asset.get('publicUrl') +                if not video_url:                      continue -                playlist_url = base_url + '?form=MPXNBCNewsAPI' - -                try: -                    all_videos = self._download_json(playlist_url, title) -                except ExtractorError as ee: -                    if isinstance(ee.cause, compat_HTTPError): -                        continue -                    raise - -                if not all_videos or 'videos' not in all_videos: -                    continue - -                try: -                    info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid) -                    break -                except StopIteration: +                container = video_asset.get('format') +                asset_type = video_asset.get('assetType') or '' +                if container == 'ISM' or asset_type == 'FireTV-Once':                      continue - -            if info is None: -                raise ExtractorError('Could not find video in playlists') +                elif asset_type == 'OnceURL': +                    tp_formats, tp_subtitles = self._extract_theplatform_smil( +                        video_url, video_id) +                    formats.extend(tp_formats) +                    subtitles = self._merge_subtitles(subtitles, tp_subtitles) +                else: +                    tbr = int_or_none(video_asset.get('bitRate'), 1000) +                    format_id = 'http%s' % ('-%d' % tbr if tbr else '') +                    video_url = update_url_query( +                        video_url, {'format': 'redirect'}) +                    # resolve the url so that we can check availability and detect the correct extension +                    head = self._request_webpage( +                        HEADRequest(video_url), video_id, +                        'Checking %s url' % format_id, +                        '%s is not available' % format_id, +                        fatal=False) +                    if head: +                        video_url = head.geturl() +                        formats.append({ +                            'format_id': format_id, +                            'url': video_url, +                            'width': int_or_none(video_asset.get('width')), +                            'height': int_or_none(video_asset.get('height')), +                            'tbr': tbr, +                            'container': video_asset.get('format'), +                        }) +            self._sort_formats(formats)              return { -                '_type': 'url', -                # We get the best quality video -                'url': info['videoAssets'][-1]['publicUrl'], -                'ie_key': 'ThePlatform', +                'id': video_id, +                'title': title, +                'description': info.get('description'), +                'thumbnail': info.get('description'), +                'thumbnail': info.get('thumbnail'), +                'duration': int_or_none(info.get('duration')), +                'timestamp': parse_iso8601(info.get('pubDate')), +                'formats': formats, +                'subtitles': subtitles,              } | 
