diff options
| author | TRox1972 <archcr8@gmail.com> | 2016-05-17 16:21:52 +0200 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2016-06-03 22:17:35 +0700 | 
| commit | 9c3c447eb389726d98189d972a2d772ef729132d (patch) | |
| tree | d331bee9e4cc9c94fcb84e748e8921aca16320ed | |
| parent | ad73083ff05eafa64e07500fd42306ac349bd76b (diff) | |
[loc] Add extractor (Closes #3188)
Added extractor of loc.gov, which closes #3188. I am not an experienced programmer, so I am sure I did a bunch of mistakes, but the extractor works (for me at least).
[LibraryOfCongress] don't use video_id for _search_regex()
[LibraryOfCongress] Improvements
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/libraryofcongress.py | 65 | 
2 files changed, 66 insertions, 0 deletions
| diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9dd55bd70..3b5143ace 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -382,6 +382,7 @@ from .leeco import (      LePlaylistIE,      LetvCloudIE,  ) +from .libraryofcongress import LibraryOfCongressIE  from .libsyn import LibsynIE  from .lifenews import (      LifeNewsIE, diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py new file mode 100644 index 000000000..0c34dbce3 --- /dev/null +++ b/youtube_dl/extractor/libraryofcongress.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import determine_ext + + +class LibraryOfCongressIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?loc\.gov/item/(?P<id>[0-9]+)' +    _TESTS = [{ +    'url': 'http://loc.gov/item/90716351/', +        'info_dict': { +            'id': '90716351', +            'ext': 'mp4', +            'title': 'Pa\'s trip to Mars /' +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        } +    }, { +        'url': 'https://www.loc.gov/item/97516576/', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        self.report_extraction(video_id) +        json_id = self._search_regex('media-player-([0-9A-Z]{32})', webpage, 'json id') + +        data = self._parse_json(self._download_webpage( +            'https://media.loc.gov/services/v1/media?id=%s' % json_id, +            video_id), video_id) +        data = data['mediaObject'] + +        media_url = data['derivatives'][0]['derivativeUrl'] +        media_url = media_url.replace('rtmp', 'https') + +        is_video = data['mediaType'].lower() == 'v' +        if not determine_ext(media_url) in ('mp4', 'mp3'): +            media_url += '.mp4' if is_video else '.mp3' + +        if media_url.index('vod/mp4:') > -1: +            media_url = media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8' +        elif url.index('vod/mp3:') > -1: +            media_url = media_url.replace('vod/mp3:', '') + +        formats = [] +        if determine_ext(media_url) == 'm3u8': +            formats = self._extract_m3u8_formats(media_url, video_id, ext='mp4') +        elif determine_ext(media_url) is 'mp3': +            formats.append({ +                'url': media_url, +                'ext': 'mp3', +            }) + +        return { +            'id': video_id, +            'thumbnail': self._og_search_thumbnail(webpage), +            'title': self._og_search_title(webpage), +            'formats': formats, +        } | 
