about summary refs log tree commit diff
path: root/youtube_dl/extractor/nytimes.py
diff options
context:
space:
mode:
author John Hawkinson <jhawk@mit.edu> 2016-10-14 22:16:43 -0400
committer Sergey M․ <dstftw@gmail.com> 2016-10-16 17:31:55 +0700
commit 74324a7ac2633869d80263f1c12d7f48928c06fe (patch)
tree 05749029e8a6b601de198aed97280cf1c82e39ce /youtube_dl/extractor/nytimes.py
parent b0dfcab60ace0a99d9287b9e9674c60cc935d67b (diff)
[nytimes] Add support for podcasts
Diffstat (limited to 'youtube_dl/extractor/nytimes.py')
-rw-r--r-- youtube_dl/extractor/nytimes.py 70
1 files changed, 65 insertions, 5 deletions
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py
index 142c34256..245d0e9a6 100644
--- a/youtube_dl/extractor/nytimes.py
+++ b/youtube_dl/extractor/nytimes.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import hmac
@@ -8,6 +9,7 @@ from .common import InfoExtractor
from ..utils import (
float_or_none,
int_or_none,
+ js_to_json,
parse_iso8601,
mimetype2ext,
determine_ext,
@@ -96,6 +98,43 @@ class NYTimesBaseIE(InfoExtractor):
'thumbnails': thumbnails,
}
+ def _extract_podcast_from_json(self, json, page_id, webpage):
+ audio_data = self._parse_json(json, page_id, transform_source=js_to_json)['data']
+
+ description = audio_data['track'].get('description')
+ if not description:
+ description = self._html_search_meta(['og:description', 'twitter:description'], webpage)
+
+ episode_title = audio_data['track']['title']
+ episode_number = None
+ episode = audio_data['podcast']['episode'].split()
+ if episode:
+ episode_number = int_or_none(episode[-1])
+ video_id = episode[-1]
+ else:
+ video_id = page_id
+
+ podcast_title = audio_data['podcast']['title']
+ title = None
+ if podcast_title:
+ title = "%s: %s" % (podcast_title, episode_title)
+ else:
+ title = episode_title
+
+ info_dict = {
+ 'id': video_id,
+ 'title': title,
+ 'creator': audio_data['track'].get('credit'),
+ 'series': podcast_title,
+ 'episode': episode_title,
+ 'episode_number': episode_number,
+ 'url': audio_data['track']['source'],
+ 'duration': audio_data['track'].get('duration'),
+ 'description': description,
+ }
+
+ return info_dict
+
class NYTimesIE(NYTimesBaseIE):
_VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
@@ -139,15 +178,36 @@ class NYTimesArticleIE(NYTimesBaseIE):
'uploader': 'Matthew Williams',
}
}, {
+ 'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html',
+ 'md5': 'e0d52040cafb07662acf3c9132db3575',
+ 'info_dict': {
+ 'id': '20',
+ 'title': "The Run-Up: \u2018He Was Like an Octopus\u2019",
+ 'ext': 'mp3',
+ 'description': 'We go behind the story of the two women who told us that Donald Trump touched them inappropriately (which he denies) and check in on Hillary Clinton’s campaign.',
+ }
+ }, {
+ 'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html',
+ 'md5': '66fb5471d7ef15da98af176dc1af4cb9',
+ 'info_dict': {
+ 'id': 'inside-the-new-york-times-book-review-the-rise-of-hitler',
+ 'title': "The Rise of Hitler",
+ 'ext': 'mp3',
+ 'description': 'Adam Kirsch discusses Volker Ullrich\'s new biography of Hitler; Billy Collins talks about his latest collection of poems; and iO Tillett Wright on his new memoir, "Darling Days."',
+ }
+ }, {
'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
'only_matching': True,
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
+ page_id = self._match_id(url)
- video_id = self._html_search_regex(r'data-videoid="(\d+)"', webpage, 'video id')
+ webpage = self._download_webpage(url, page_id)
- return self._extract_video_from_id(video_id)
+ video_id = self._html_search_regex(r'data-videoid="(\d+)"', webpage, 'video id', None, False)
+ if video_id is not None:
+ return self._extract_video_from_id(video_id)
+
+ data_json = self._html_search_regex(r'NYTD\.FlexTypes\.push\(({.*})\);', webpage, 'json data')
+ return self._extract_podcast_from_json(data_json, page_id, webpage)