diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/adultswim.py | 139 | ||||
| -rw-r--r-- | youtube_dl/extractor/comedycentral.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/dfb.py | 44 | ||||
| -rw-r--r-- | youtube_dl/extractor/mlb.py | 102 | ||||
| -rw-r--r-- | youtube_dl/extractor/npo.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/redtube.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/rtbf.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 44 | 
9 files changed, 320 insertions, 26 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 78b95c2a5..f78aa066f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,5 +1,6 @@  from .academicearth import AcademicEarthCourseIE  from .addanime import AddAnimeIE +from .adultswim import AdultSwimIE  from .aftonbladet import AftonbladetIE  from .anitube import AnitubeIE  from .aol import AolIE @@ -63,6 +64,7 @@ from .dailymotion import (      DailymotionUserIE,  )  from .daum import DaumIE +from .dfb import DFBIE  from .dotsub import DotsubIE  from .dreisat import DreiSatIE  from .drtv import DRTVIE @@ -171,6 +173,7 @@ from .metacafe import MetacafeIE  from .metacritic import MetacriticIE  from .mit import TechTVMITIE, MITIE, OCWMITIE  from .mixcloud import MixcloudIE +from .mlb import MLBIE  from .mpora import MporaIE  from .mofosex import MofosexIE  from .mooshare import MooshareIE diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py new file mode 100644 index 000000000..a00bfcb35 --- /dev/null +++ b/youtube_dl/extractor/adultswim.py @@ -0,0 +1,139 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +class AdultSwimIE(InfoExtractor): +    _VALID_URL = r'https?://video\.adultswim\.com/(?P<path>.+?)(?:\.html)?(?:\?.*)?(?:#.*)?$' +    _TEST = { +        'url': 'http://video.adultswim.com/rick-and-morty/close-rick-counters-of-the-rick-kind.html?x=y#title', +        'playlist': [ +            { +                'md5': '4da359ec73b58df4575cd01a610ba5dc', +                'info_dict': { +                    'id': '8a250ba1450996e901453d7f02ca02f5', +                    'ext': 'flv', +                    'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 1', +                    'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', +                    'uploader': 'Rick and Morty', +                    'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' +                } +            }, +            { +                'md5': 'ffbdf55af9331c509d95350bd0cc1819', +                'info_dict': { +                    'id': '8a250ba1450996e901453d7f4bd102f6', +                    'ext': 'flv', +                    'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 2', +                    'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', +                    'uploader': 'Rick and Morty', +                    'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' +                } +            }, +            { +                'md5': 'b92409635540304280b4b6c36bd14a0a', +                'info_dict': { +                    'id': '8a250ba1450996e901453d7fa73c02f7', +                    'ext': 'flv', +                    'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 3', +                    'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', +                    'uploader': 'Rick and Morty', +                    'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' +                } +            }, +            { +                'md5': 'e8818891d60e47b29cd89d7b0278156d', +                'info_dict': { +                    'id': '8a250ba1450996e901453d7fc8ba02f8', +                    'ext': 'flv', +                    'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 4', +                    'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', +                    'uploader': 'Rick and Morty', +                    'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' +                } +            } +        ] +    } + +    _video_extensions = { +        '3500': 'flv', +        '640': 'mp4', +        '150': 'mp4', +        'ipad': 'm3u8', +        'iphone': 'm3u8' +    } +    _video_dimensions = { +        '3500': (1280, 720), +        '640': (480, 270), +        '150': (320, 180) +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_path = mobj.group('path') + +        webpage = self._download_webpage(url, video_path) +        episode_id = self._html_search_regex(r'<link rel="video_src" href="http://i\.adultswim\.com/adultswim/adultswimtv/tools/swf/viralplayer.swf\?id=([0-9a-f]+?)"\s*/?\s*>', webpage, 'episode_id') +        title = self._og_search_title(webpage) + +        index_url = 'http://asfix.adultswim.com/asfix-svc/episodeSearch/getEpisodesByIDs?networkName=AS&ids=%s' % episode_id +        idoc = self._download_xml(index_url, title, 'Downloading episode index', 'Unable to download episode index') + +        episode_el = idoc.find('.//episode') +        show_title = episode_el.attrib.get('collectionTitle') +        episode_title = episode_el.attrib.get('title') +        thumbnail = episode_el.attrib.get('thumbnailUrl') +        description = episode_el.find('./description').text.strip() + +        entries = [] +        segment_els = episode_el.findall('./segments/segment') + +        for part_num, segment_el in enumerate(segment_els): +            segment_id = segment_el.attrib.get('id') +            segment_title = '%s %s part %d' % (show_title, episode_title, part_num + 1) +            thumbnail = segment_el.attrib.get('thumbnailUrl') +            duration = segment_el.attrib.get('duration') + +            segment_url = 'http://asfix.adultswim.com/asfix-svc/episodeservices/getCvpPlaylist?networkName=AS&id=%s' % segment_id +            idoc = self._download_xml(segment_url, segment_title, 'Downloading segment information', 'Unable to download segment information') + +            formats = [] +            file_els = idoc.findall('.//files/file') + +            for file_el in file_els: +                bitrate = file_el.attrib.get('bitrate') +                type = file_el.attrib.get('type') +                width, height = self._video_dimensions.get(bitrate, (None, None)) +                formats.append({ +                    'format_id': '%s-%s' % (bitrate, type), +                    'url': file_el.text, +                    'ext': self._video_extensions.get(bitrate, 'mp4'), +                    # The bitrate may not be a number (for example: 'iphone') +                    'tbr': int(bitrate) if bitrate.isdigit() else None, +                    'height': height, +                    'width': width +                }) + +            self._sort_formats(formats) + +            entries.append({ +                'id': segment_id, +                'title': segment_title, +                'formats': formats, +                'uploader': show_title, +                'thumbnail': thumbnail, +                'duration': duration, +                'description': description +            }) + +        return { +            '_type': 'playlist', +            'id': episode_id, +            'display_id': video_path, +            'entries': entries, +            'title': '%s %s' % (show_title, episode_title), +            'description': description, +            'thumbnail': thumbnail +        } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 8af0abade..c81ce5a96 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -14,13 +14,13 @@ from ..utils import (  class ComedyCentralIE(MTVServicesInfoExtractor): -    _VALID_URL = r'''(?x)https?://(?:www\.)?(comedycentral|cc)\.com/ -        (video-clips|episodes|cc-studios|video-collections) +    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ +        (video-clips|episodes|cc-studios|video-collections|full-episodes)          /(?P<title>.*)'''      _FEED_URL = 'http://comedycentral.com/feeds/mrss/'      _TEST = { -        'url': 'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', +        'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',          'md5': 'c4f48e9eda1b16dd10add0744344b6d8',          'info_dict': {              'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py new file mode 100644 index 000000000..cb8e06822 --- /dev/null +++ b/youtube_dl/extractor/dfb.py @@ -0,0 +1,44 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class DFBIE(InfoExtractor): +    IE_NAME = 'tv.dfb.de' +    _VALID_URL = r'https?://tv\.dfb\.de/video/[^/]+/(?P<id>\d+)' + +    _TEST = { +        'url': 'http://tv.dfb.de/video/highlights-des-empfangs-in-berlin/9070/', +        # The md5 is different each time +        'info_dict': { +            'id': '9070', +            'ext': 'flv', +            'title': 'Highlights des Empfangs in Berlin', +            'upload_date': '20140716', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) +        player_info = self._download_xml( +            'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, +            video_id) +        video_info = player_info.find('video') + +        f4m_info = self._download_xml(video_info.find('url').text, video_id) +        token_el = f4m_info.find('token') +        manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' + +        return { +            'id': video_id, +            'title': video_info.find('title').text, +            'url': manifest_url, +            'ext': 'flv', +            'thumbnail': self._og_search_thumbnail(webpage), +            'upload_date': ''.join(video_info.find('time_date').text.split('.')[::-1]), +        } diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py new file mode 100644 index 000000000..18ab2c135 --- /dev/null +++ b/youtube_dl/extractor/mlb.py @@ -0,0 +1,102 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    parse_duration, +    parse_iso8601, +    find_xpath_attr, +) + + +class MLBIE(InfoExtractor): +    _VALID_URL = r'http?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?P<id>n?\d+)' +    _TESTS = [ +        { +            'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', +            'md5': 'd9c022c10d21f849f49c05ae12a8a7e9', +            'info_dict': { +                'id': '34496663', +                'ext': 'mp4', +                'title': 'Stanton prepares for Derby', +                'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57', +                'duration': 46, +                'timestamp': 1405105800, +                'upload_date': '20140711', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +        }, +        { +            'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby', +            'md5': '0e6e73d509321e142409b695eadd541f', +            'info_dict': { +                'id': '34578115', +                'ext': 'mp4', +                'title': 'Cespedes repeats as Derby champ', +                'description': 'md5:08df253ce265d4cf6fb09f581fafad07', +                'duration': 488, +                'timestamp': 1405399936, +                'upload_date': '20140715', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +        }, +        { +            'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance', +            'md5': 'b8fd237347b844365d74ea61d4245967', +            'info_dict': { +                'id': '34577915', +                'ext': 'mp4', +                'title': 'Bautista on Home Run Derby', +                'description': 'md5:b80b34031143d0986dddc64a8839f0fb', +                'duration': 52, +                'timestamp': 1405390722, +                'upload_date': '20140715', +                'thumbnail': 're:^https?://.*\.jpg$', +            }, +        }, +    ] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        detail = self._download_xml( +            'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml' +            % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id) + +        title = detail.find('./headline').text +        description = detail.find('./big-blurb').text +        duration = parse_duration(detail.find('./duration').text) +        timestamp = parse_iso8601(detail.attrib['date'][:-5]) + +        thumbnail = find_xpath_attr( +            detail, './thumbnailScenarios/thumbnailScenario', 'type', '45').text + +        formats = [] +        for media_url in detail.findall('./url'): +            playback_scenario = media_url.attrib['playback_scenario'] +            fmt = { +                'url': media_url.text, +                'format_id': playback_scenario, +            } +            m = re.search(r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)', playback_scenario) +            if m: +                fmt.update({ +                    'vbr': int(m.group('vbr')) * 1000, +                    'width': int(m.group('width')), +                    'height': int(m.group('height')), +                }) +            formats.append(fmt) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'duration': duration, +            'timestamp': timestamp, +            'formats': formats, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index fbcbe1f40..12e85a716 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -32,7 +32,7 @@ class NPOIE(InfoExtractor):              'http://e.omroep.nl/metadata/aflevering/%s' % video_id,              video_id,              # We have to remove the javascript callback -            transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//epc', r'\1', j) +            transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//.*$', r'\1', j)          )          token_page = self._download_webpage(              'http://ida.omroep.nl/npoplayer/i.js', diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 4295cf93a..d1e12dd8d 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -35,9 +35,7 @@ class RedTubeIE(InfoExtractor):              r'<h1 class="videoTitle[^"]*">(.+?)</h1>',              webpage, u'title') -        video_thumbnail = self._html_search_regex( -            r'playerInnerHTML.+?<img\s+src="(.+?)"', -            webpage, u'thumbnail', fatal=False) +        video_thumbnail = self._og_search_thumbnail(webpage)          # No self-labeling, but they describe themselves as          # "Home of Videos Porno" diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 205f8a167..dce64e151 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -30,7 +30,7 @@ class RTBFIE(InfoExtractor):          page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)          data = json.loads(self._html_search_regex( -            r'<div class="js-player-embed" data-video="([^"]+)"', page, 'data video'))['data'] +            r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data']          video_url = data.get('downloadUrl') or data.get('url') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6123e1256..5449df8e0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -347,8 +347,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          self.to_screen(u'RTMP download detected')      def _extract_signature_function(self, video_id, player_url, slen): -        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$', -                        player_url) +        id_m = re.match( +            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3)?\.(?P<ext>[a-z]+)$', +            player_url)          player_type = id_m.group('ext')          player_id = id_m.group('id') @@ -1220,31 +1221,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                          url += '&signature=' + url_data['sig'][0]                      elif 's' in url_data:                          encrypted_sig = url_data['s'][0] + +                        if not age_gate: +                            jsplayer_url_json = self._search_regex( +                                r'"assets":.+?"js":\s*("[^"]+")', +                                video_webpage, u'JS player URL') +                            player_url = json.loads(jsplayer_url_json) +                        if player_url is None: +                            player_url_json = self._search_regex( +                                r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', +                                video_webpage, u'age gate player URL') +                            player_url = json.loads(player_url_json) +                          if self._downloader.params.get('verbose'): -                            if age_gate: -                                if player_url is None: -                                    player_version = 'unknown' -                                else: +                            if player_url is None: +                                player_version = 'unknown' +                                player_desc = 'unknown' +                            else: +                                if player_url.endswith('swf'):                                      player_version = self._search_regex(                                          r'-(.+)\.swf$', player_url,                                          u'flash player', fatal=False) -                                player_desc = 'flash player %s' % player_version -                            else: -                                player_version = self._search_regex( -                                    r'html5player-(.+?)\.js', video_webpage, -                                    'html5 player', fatal=False) -                                player_desc = u'html5 player %s' % player_version +                                    player_desc = 'flash player %s' % player_version +                                else: +                                    player_version = self._search_regex( +                                        r'html5player-(.+?)\.js', video_webpage, +                                        'html5 player', fatal=False) +                                    player_desc = u'html5 player %s' % player_version                              parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))                              self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %                                  (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) -                        if not age_gate: -                            jsplayer_url_json = self._search_regex( -                                r'"assets":.+?"js":\s*("[^"]+")', -                                video_webpage, u'JS player URL') -                            player_url = json.loads(jsplayer_url_json) -                          signature = self._decrypt_signature(                              encrypted_sig, video_id, player_url, age_gate)                          url += '&signature=' + signature  | 
