diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2015-02-03 10:58:28 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2015-02-03 10:58:28 +0100 | 
| commit | 9bb8e0a3f9276f65de38cda431bf72f7bd266693 (patch) | |
| tree | e7e6d9d96a107d4ac28338c58ad9a8bce98e6097 | |
| parent | 1a6373ef3960add0117f797e0afe3322352c1c52 (diff) | |
[wsj] Add new extractor (Fixes #4854)
| -rw-r--r-- | test/test_utils.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/wsj.py | 89 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 2 | 
5 files changed, 95 insertions, 1 deletion
diff --git a/test/test_utils.py b/test/test_utils.py index 0ffccd35f..80c765bc4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -156,6 +156,9 @@ class TestUtil(unittest.TestCase):          self.assertEqual(              unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False),              '20141126') +        self.assertEqual( +            unified_strdate('2/2/2015 6:47:40 PM', day_first=False), +            '20150202')      def test_find_xpath_attr(self):          testxml = '''<root> diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5dcb14feb..5866a7617 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -554,6 +554,7 @@ from .wimp import WimpIE  from .wistia import WistiaIE  from .worldstarhiphop import WorldStarHipHopIE  from .wrzuta import WrzutaIE +from .wsj import WSJIE  from .xbef import XBefIE  from .xboxclips import XboxClipsIE  from .xhamster import XHamsterIE diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 653d793fc..602601b24 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -145,6 +145,7 @@ class InfoExtractor(object):      thumbnail:      Full URL to a video thumbnail image.      description:    Full video description.      uploader:       Full name of the video uploader. +    creator:        The main artist who created the video.      timestamp:      UNIX timestamp of the moment the video became available.      upload_date:    Video upload date (YYYYMMDD).                      If not explicitly set, calculated from timestamp. 
# encoding: utf-8
# diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
# new file mode 100644
# index 000000000..cbe3dc7be
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
    unified_strdate,
)


class WSJIE(InfoExtractor):
    """Extractor for Wall Street Journal video-api iframe player URLs.

    The video GUID is taken from the ``guid`` query parameter of the player
    URL and looked up through the ``find_all_videos.asp`` JSON endpoint.
    """

    _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P<id>[a-zA-Z0-9-]+)'
    IE_DESC = 'Wall Street Journal'
    _TEST = {
        'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
        'md5': '9747d7a6ebc2f4df64b981e1dde9efa9',
        'info_dict': {
            'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
            'ext': 'mp4',
            'upload_date': '20150202',
            'uploader_id': 'bbright',
            'creator': 'bbright',
            'categories': list,  # a long list
            'duration': 90,
            'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the video whose GUID is embedded in *url*.

        Formats are collected from up to three places in the API response:
        the ``videoURL`` field (f4m meta URL), the ``hls`` field (m3u8
        playlist) and one ``video<bitrate>kMP4Url`` field per known bitrate.
        Each source is optional; only the ones present are used.

        Raises ExtractorError when the API response contains no items.
        """
        video_id = self._match_id(url)

        # Bitrates (used as 'tbr') for which a direct MP4 URL field is
        # requested from the API.
        bitrates = [128, 174, 264, 320, 464, 664, 1264]
        api_url = (
            'http://video-api.wsj.com/api-video/find_all_videos.asp?'
            'type=guid&count=1&query=%s&'
            'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,'
            'author,description,name,linkURL,videoStillURL,duration,videoURL,'
            'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,'
            'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,'
            'allthingsd-subsection,sm-section,sm-subsection,provider,'
            'formattedCreationDate,keywords,keywordsOmniture,column,editor,'
            'emailURL,emailPartnerID,showName,omnitureProgramName,'
            'omnitureVideoFormat,linkRelativeURL,touchCastID,'
            'omniturePublishDate,%s') % (
                video_id, ','.join('video%dkMP4Url' % br for br in bitrates))

        items = self._download_json(api_url, video_id).get('items')
        if not items:
            # Report a descriptive extraction error instead of a bare
            # IndexError/KeyError when the lookup yields nothing.
            raise ExtractorError('No video found for GUID %s' % video_id)
        info = items[0]

        # Thumbnails are conveniently in the correct format already
        thumbnails = info.get('thumbnailList')
        creator = info.get('author')
        uploader_id = info.get('editor')
        categories = info.get('keywords')
        duration = int_or_none(info.get('duration'))
        upload_date = unified_strdate(
            info.get('formattedCreationDate'), day_first=False)
        # Fall back to 'titletag' when 'name' is missing *or* empty.
        title = info.get('name') or info.get('titletag')

        formats = []
        f4m_url = info.get('videoURL')
        if f4m_url:
            formats.append({
                'format_id': 'f4m',
                'format_note': 'f4m (meta URL)',
                'url': f4m_url,
            })
        hls_url = info.get('hls')
        if hls_url:
            formats.extend(self._extract_m3u8_formats(
                hls_url, video_id, ext='mp4',
                preference=0, entry_protocol='m3u8_native'))
        for br in bitrates:
            mp4_url = info.get('video%dkMP4Url' % br)
            if mp4_url:
                formats.append({
                    'format_id': 'mp4-%d' % br,
                    'container': 'mp4',
                    'tbr': br,
                    'url': mp4_url,
                })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnails': thumbnails,
            'creator': creator,
            'uploader_id': uploader_id,
            'duration': duration,
            'upload_date': upload_date,
            'categories': categories,
        }


# Remaining hunk of the same commit (it shared a source line with the end of
# the class above); kept verbatim, one diff line per comment line:
#
# diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
# index 251074bf5..8f5463f1c 100644
# --- a/youtube_dl/utils.py
# +++ b/youtube_dl/utils.py
# @@ -701,7 +701,7 @@ def unified_strdate(date_str, day_first=True):
#      # %z (UTC offset) is only supported in python>=3.2
#      date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
#      # Remove AM/PM + timezone
# -    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
# +    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
#      format_expressions = [
#          '%d %B %Y',
