diff options
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/appletrailers.py | 167 | 
2 files changed, 168 insertions, 0 deletions
# Contents of youtube_dl/extractor/appletrailers.py (added as a new file).
# The same change registers this extractor by adding
# ``from .appletrailers import AppleTrailersIE`` at the top of
# youtube_dl/extractor/__init__.py.
import re
import xml.etree.ElementTree

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    ExtractorError,
)


class AppleTrailersIE(InfoExtractor):
    """Extract every trailer listed on a trailers.apple.com movie page.

    The result is a playlist with one entry per trailer; each entry carries
    the list of formats found for that trailer.
    """

    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
    _TEST = {
        u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
        u"playlist": [
            {
                u"file": u"manofsteel-trailer4.mov",
                u"md5": u"11874af099d480cc09e103b189805d5f",
                u"info_dict": {
                    u"duration": 111,
                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg",
                    u"title": u"Trailer 4",
                    u"upload_date": u"20130523",
                    u"uploader_id": u"wb",
                },
            },
            {
                u"file": u"manofsteel-trailer3.mov",
                u"md5": u"07a0a262aae5afe68120eed61137ab34",
                u"info_dict": {
                    u"duration": 182,
                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg",
                    u"title": u"Trailer 3",
                    u"upload_date": u"20130417",
                    u"uploader_id": u"wb",
                },
            },
            {
                u"file": u"manofsteel-trailer.mov",
                u"md5": u"e401fde0813008e3307e54b6f384cff1",
                u"info_dict": {
                    u"duration": 148,
                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg",
                    u"title": u"Trailer",
                    u"upload_date": u"20121212",
                    u"uploader_id": u"wb",
                },
            },
            {
                u"file": u"manofsteel-teaser.mov",
                u"md5": u"76b392f2ae9e7c98b22913c10a639c97",
                u"info_dict": {
                    u"duration": 93,
                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg",
                    u"title": u"Teaser",
                    u"upload_date": u"20120721",
                    u"uploader_id": u"wb",
                },
            }
        ]
    }

    def _real_extract(self, url):
        """Build the playlist of trailers for the movie page at *url*.

        Downloads the ``web.inc`` playlist snippet, then for every trailer
        follows the per-size and per-detail pages to collect the movie links.

        Returns a ``playlist`` result dict whose ``entries`` are ``video``
        info dicts.

        Raises ExtractorError when a downloaded page does not have the
        expected structure or when no format is found for a trailer.
        """
        mobj = re.match(self._VALID_URL, url)
        movie = mobj.group('movie')
        uploader_id = mobj.group('company')

        playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc'
        playlist_snippet = self._download_webpage(playlist_url, movie)
        # Drop inline scripts and wrap the snippet in a single root element
        # before handing it to the XML parser.
        playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet)
        playlist_html = u'<html>' + playlist_cleaned + u'</html>'

        # Parsed size pages keyed by size id, so each one is fetched once.
        size_cache = {}

        doc = xml.etree.ElementTree.fromstring(playlist_html)
        playlist = []
        for li in doc.findall('./div/ul/li'):
            title = li.find('.//h3').text
            video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
            thumbnail = li.find('.//img').attrib['src']

            # Upload date and runtime are optional metadata: leave them as
            # None when the markup is missing instead of failing extraction.
            upload_date = None
            duration = None
            date_el = li.find('.//p')
            if date_el is not None:
                m = re.search(
                    r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})',
                    date_el.text or u'')
                if m:
                    # The page gives a two-digit year; prefix the century.
                    upload_date = u'20' + m.group('year') + m.group('month') + m.group('day')
                runtime_el = date_el.find('./br')
                if runtime_el is not None:
                    # The runtime is the text following the <br/>.
                    m = re.search(
                        r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})',
                        runtime_el.tail or u'')
                    if m:
                        duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))

            formats = []
            for formats_el in li.findall('.//li/a'):
                # .get() so anchors without a class are skipped, not fatal.
                if formats_el.get('class') != 'OverlayPanel':
                    continue
                target = formats_el.attrib['target']

                format_code = formats_el.text
                if 'Automatic' in format_code:
                    continue

                size_q = formats_el.attrib['href']
                size_id = size_q.rpartition('#videos-')[2]
                if size_id not in size_cache:
                    size_url = url + size_q
                    sizepage_html = self._download_webpage(
                        size_url, movie,
                        note=u'Downloading size info %s' % size_id,
                        errnote=u'Error while downloading size info %s' % size_id,
                    )
                    size_cache[size_id] = xml.etree.ElementTree.fromstring(sizepage_html)

                sizepage_doc = size_cache[size_id]
                links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a')
                for vid_a in links:
                    href = vid_a.get('href')
                    # Only links ending with this trailer's target belong to it.
                    if not href or not href.endswith(target):
                        continue
                    detail_q = href.partition('#')[0]
                    detail_url = url + '/' + detail_q

                    m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q)
                    if m is None:
                        raise ExtractorError(
                            u'Unexpected detail link %r' % detail_q)
                    detail_id = m.group('detail_id')

                    detail_html = self._download_webpage(
                        detail_url, movie,
                        note=u'Downloading detail %s %s' % (detail_id, size_id),
                        errnote=u'Error while downloading detail %s %s' % (detail_id, size_id)
                    )
                    detail_doc = xml.etree.ElementTree.fromstring(detail_html)
                    movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a')
                    # Explicit errors instead of ``assert``: assertions are
                    # stripped under ``python -O`` and would let malformed
                    # pages through silently.
                    if movie_link_el is None or movie_link_el.get('class') != 'movieLink':
                        raise ExtractorError(
                            u'Unable to find movie link in detail %s %s' % (detail_id, size_id))
                    # Strip the query string and turn every '_' into '_h'.
                    # NOTE(review): presumably this selects the directly
                    # downloadable variant of the file - confirm.
                    movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h')
                    ext = determine_ext(movie_link)
                    if ext != 'mov':
                        raise ExtractorError(
                            u'Unexpected extension %r in movie link %s' % (ext, movie_link))

                    formats.append({
                        'format': format_code,
                        'ext': ext,
                        'url': movie_link,
                    })

            if not formats:
                raise ExtractorError(u'No formats found for %s' % video_id)

            info = {
                '_type': 'video',
                'id': video_id,
                'title': title,
                'formats': formats,
                'duration': duration,
                'thumbnail': thumbnail,
                'upload_date': upload_date,
                'uploader_id': uploader_id,
                'user_agent': 'QuickTime compatible (youtube-dl)',
            }
            # TODO: Remove when #980 has been merged
            info['url'] = formats[-1]['url']
            info['ext'] = formats[-1]['ext']

            playlist.append(info)

        return {
            '_type': 'playlist',
            'id': movie,
            'entries': playlist,
        }
