diff options
Diffstat (limited to 'youtube_dl/extractor/nytimes.py')
| -rw-r--r-- | youtube_dl/extractor/nytimes.py | 82 | 
1 files changed, 76 insertions, 6 deletions
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 142c34256..2bb77ab24 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -1,3 +1,4 @@ +# coding: utf-8  from __future__ import unicode_literals  import hmac @@ -6,11 +7,13 @@ import base64  from .common import InfoExtractor  from ..utils import ( +    determine_ext,      float_or_none,      int_or_none, -    parse_iso8601, +    js_to_json,      mimetype2ext, -    determine_ext, +    parse_iso8601, +    remove_start,  ) @@ -139,15 +142,82 @@ class NYTimesArticleIE(NYTimesBaseIE):              'uploader': 'Matthew Williams',          }      }, { +        'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html', +        'md5': 'e0d52040cafb07662acf3c9132db3575', +        'info_dict': { +            'id': '100000004709062', +            'title': 'The Run-Up: ‘He Was Like an Octopus’', +            'ext': 'mp3', +            'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4', +            'series': 'The Run-Up', +            'episode': '‘He Was Like an Octopus’', +            'episode_number': 20, +            'duration': 2130, +        } +    }, { +        'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html', +        'info_dict': { +            'id': '100000004709479', +            'title': 'The Rise of Hitler', +            'ext': 'mp3', +            'description': 'md5:bce877fd9e3444990cb141875fab0028', +            'creator': 'Pamela Paul', +            'duration': 3475, +        }, +        'params': { +            'skip_download': True, +        }, +    }, {          'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',          'only_matching': True,      }] +    def _extract_podcast_from_json(self, json, page_id, webpage): +        podcast_audio = self._parse_json( +            json, page_id, transform_source=js_to_json) + +        audio_data = podcast_audio['data'] +        track = audio_data['track'] + +        episode_title = track['title'] +        video_url = track['source'] + +        description = track.get('description') or self._html_search_meta( +            ['og:description', 'twitter:description'], webpage) + +        podcast_title = audio_data.get('podcast', {}).get('title') +        title = ('%s: %s' % (podcast_title, episode_title) +                 if podcast_title else episode_title) + +        episode = audio_data.get('podcast', {}).get('episode') or '' +        episode_number = int_or_none(self._search_regex( +            r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None)) + +        return { +            'id': remove_start(podcast_audio.get('target'), 'FT') or page_id, +            'url': video_url, +            'title': title, +            'description': description, +            'creator': track.get('credit'), +            'series': podcast_title, +            'episode': episode_title, +            'episode_number': episode_number, +            'duration': int_or_none(track.get('duration')), +        } +      def _real_extract(self, url): -        video_id = self._match_id(url) +        page_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) +        webpage = self._download_webpage(url, page_id) -        video_id = self._html_search_regex(r'data-videoid="(\d+)"', webpage, 'video id') +        video_id = self._search_regex( +            r'data-videoid=["\'](\d+)', webpage, 'video id', +            default=None, fatal=False) +        if video_id is not None: +            return self._extract_video_from_id(video_id) -        return self._extract_video_from_id(video_id) +        podcast_data = self._search_regex( +            (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script', +             r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), +            webpage, 'podcast data') +        return self._extract_podcast_from_json(podcast_data, page_id, webpage)  | 
