diff options
| author | Remita Amine <remitamine@gmail.com> | 2016-08-05 16:40:21 +0100 | 
|---|---|---|
| committer | Remita Amine <remitamine@gmail.com> | 2016-08-05 16:42:15 +0100 | 
| commit | d50aca41f8e3aa3af8e19ec91100283b555ac59f (patch) | |
| tree | 985ad6d5cc14603601a41017a3e0cacb80f8cfdc | |
| parent | 0ca057b965e5699a88fc952da460b5adfb8e7644 (diff) | |
[archiveorg] improve format extraction(closes #10219)
| -rw-r--r-- | youtube_dl/extractor/archiveorg.py | 78 | 
1 file changed, 38 insertions, 40 deletions
```diff
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index 8feb7cb74..2472e4cc6 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -1,67 +1,65 @@
 from __future__ import unicode_literals
 
-from .common import InfoExtractor
-from ..utils import unified_strdate
+from .jwplatform import JWPlatformBaseIE
+from ..utils import (
+    unified_strdate,
+    clean_html,
+)
 
 
-class ArchiveOrgIE(InfoExtractor):
+class ArchiveOrgIE(JWPlatformBaseIE):
     IE_NAME = 'archive.org'
     IE_DESC = 'archive.org videos'
-    _VALID_URL = r'https?://(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
+    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$'
     _TESTS = [{
         'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
         'md5': '8af1d4cf447933ed3c7f4871162602db',
         'info_dict': {
             'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
-            'ext': 'ogv',
+            'ext': 'ogg',
             'title': '1968 Demo - FJCC Conference Presentation Reel #1',
-            'description': 'md5:1780b464abaca9991d8968c877bb53ed',
+            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
             'upload_date': '19681210',
             'uploader': 'SRI International'
         }
     }, {
         'url': 'https://archive.org/details/Cops1922',
-        'md5': '18f2a19e6d89af8425671da1cf3d4e04',
+        'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba',
         'info_dict': {
             'id': 'Cops1922',
-            'ext': 'ogv',
+            'ext': 'mp4',
             'title': 'Buster Keaton\'s "Cops" (1922)',
-            'description': 'md5:70f72ee70882f713d4578725461ffcc3',
+            'description': 'md5:b4544662605877edd99df22f9620d858',
         }
+    }, {
+        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'http://archive.org/embed/' + video_id, video_id)
+        jwplayer_playlist = self._parse_json(self._search_regex(
+            r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\);",
+            webpage, 'jwplayer playlist'), video_id)
+        info = self._parse_jwplayer_data(
+            {'playlist': jwplayer_playlist}, video_id, base_url=url)
 
-        json_url = url + ('&' if '?' in url else '?') + 'output=json'
-        data = self._download_json(json_url, video_id)
-
-        def get_optional(data_dict, field):
-            return data_dict['metadata'].get(field, [None])[0]
-
-        title = get_optional(data, 'title')
-        description = get_optional(data, 'description')
-        uploader = get_optional(data, 'creator')
-        upload_date = unified_strdate(get_optional(data, 'date'))
+        def get_optional(metadata, field):
+            return metadata.get(field, [None])[0]
 
-        formats = [
-            {
-                'format': fdata['format'],
-                'url': 'http://' + data['server'] + data['dir'] + fn,
-                'file_size': int(fdata['size']),
-            }
-            for fn, fdata in data['files'].items()
-            if 'Video' in fdata['format']]
-
-        self._sort_formats(formats)
-
-        return {
-            '_type': 'video',
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'description': description,
-            'uploader': uploader,
-            'upload_date': upload_date,
-            'thumbnail': data.get('misc', {}).get('image'),
-        }
+        metadata = self._download_json(
+            'http://archive.org/details/' + video_id, video_id, query={
+                'output': 'json',
+            })['metadata']
+        info.update({
+            'title': get_optional(metadata, 'title') or info.get('title'),
+            'description': clean_html(get_optional(metadata, 'description')),
+        })
+        if info.get('_type') != 'playlist':
+            info.update({
+                'uploader': get_optional(metadata, 'creator'),
+                'upload_date': unified_strdate(get_optional(metadata, 'date')),
+            })
+        return info
\ No newline at end of file
```
