diff options
| author | Sergey M․ <dstftw@gmail.com> | 2017-04-15 20:51:47 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2017-04-15 20:59:05 +0700 | 
| commit | b2a19e38293206a4ff687315baf0369c205bcd6b (patch) | |
| tree | 4dc855d3fe6c2117d46a7e96ee3ad3c2b4f06e2a | |
| parent | 3266d08af29bbd6078aca172741458ddee180ab9 (diff) | |
[wsj] Improve and modernize (closes #12558)
| -rw-r--r-- | youtube_dl/extractor/wsj.py | 40 | 
1 file changed, 23 insertions, 17 deletions
```diff
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index ec38a2ad8..45cfca7c5 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -11,12 +11,13 @@ from ..utils import (
 
 class WSJIE(InfoExtractor):
     _VALID_URL = r'''(?x)
-        (?:
-            https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
-            https?://(?:www\.)?wsj\.com/video/[^/]+/|
-            wsj:
-        )
-        (?P<id>[a-zA-Z0-9-]+)'''
+                        (?:
+                            https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
+                            https?://(?:www\.)?wsj\.com/video/[^/]+/|
+                            wsj:
+                        )
+                        (?P<id>[a-fA-F0-9-]{36})
+                    '''
     IE_DESC = 'Wall Street Journal'
     _TESTS = [{
         'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
@@ -39,12 +40,17 @@ class WSJIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        api_url = (
-            'http://video-api.wsj.com/api-video/find_all_videos.asp?'
-            'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
-            'thumbnailList,author,description,name,duration,videoURL,'
-            'titletag,formattedCreationDate,keywords,editor' % video_id)
-        info = self._download_json(api_url, video_id)['items'][0]
+        info = self._download_json(
+            'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
+            query={
+                'type': 'guid',
+                'count': 1,
+                'query': video_id,
+                'fields': ','.join((
+                    'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
+                    'description', 'name', 'duration', 'videoURL', 'titletag',
+                    'formattedCreationDate', 'keywords', 'editor')),
+            })['items'][0]
         title = info.get('name', info.get('titletag'))
 
         formats = []
@@ -91,8 +97,8 @@ class WSJIE(InfoExtractor):
 
 
 class WSJArticleIE(InfoExtractor):
-    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>\w[^/]+)'
-    _TESTS = [{
+    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
+    _TEST = {
         'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
         'info_dict': {
             'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
@@ -101,11 +107,11 @@ class WSJArticleIE(InfoExtractor):
             'uploader_id': 'ralcaraz',
             'title': 'Bao Bao the Panda Leaves for China',
         }
-    }]
+    }
 
     def _real_extract(self, url):
         article_id = self._match_id(url)
         webpage = self._download_webpage(url, article_id)
-        video_id = self._search_regex(r'data-src=["\']([A-Z0-9\-]+)',
-                                      webpage, 'video id')
+        video_id = self._search_regex(
+            r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
         return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)
```
